github.com/fisco-bcos/crypto@v0.0.0-20200202032121-bd8ab0b5d4f1/elliptic/p256_asm_s390x.s (about) 1 // Copyright 2016 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 #include "textflag.h" 6 #include "go_asm.h" 7 8 9 DATA p256ordK0<>+0x00(SB)/4, $0xee00bc4f 10 DATA p256ord<>+0x00(SB)/8, $0xffffffff00000000 11 DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff 12 DATA p256ord<>+0x10(SB)/8, $0xbce6faada7179e84 13 DATA p256ord<>+0x18(SB)/8, $0xf3b9cac2fc632551 14 DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256 15 DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256 16 DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256 17 DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256 18 DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0 19 DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0 20 DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0 d1 d0 0 21 DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0 d1 d0 0 22 DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0 23 DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0 24 DATA p256mul<>+0x00(SB)/8, $0xffffffff00000001 // P256 25 DATA p256mul<>+0x08(SB)/8, $0x0000000000000000 // P256 26 DATA p256mul<>+0x10(SB)/8, $0x00000000ffffffff // P256 27 DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256 28 DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0 0 0 d0 29 DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0 0 0 d0 30 DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0 0 d1 d0 31 DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0 0 d1 d0 32 DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL 0 d1 d0 d1 33 DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL 0 d1 d0 d1 34 DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL 0 0 d1 d0 35 DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL 0 0 d1 d0 36 DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0 37 DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0 38 DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0 d1 d0 0 39 DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0 d1 d0 0 40 DATA p256mul<>+0x80(SB)/8, $0x00000000fffffffe // (1*2^256)%P256 41 DATA p256mul<>+0x88(SB)/8, $0xffffffffffffffff // (1*2^256)%P256 42 DATA p256mul<>+0x90(SB)/8, $0xffffffff00000000 // (1*2^256)%P256 43 DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256 44 GLOBL p256ordK0<>(SB), 8, $4 45 GLOBL p256ord<>(SB), 8, $32 46 GLOBL p256<>(SB), 8, $80 47 GLOBL p256mul<>(SB), 8, $160 48 49 DATA p256vmsl<>+0x0(SB)/8, $0x0012131415161718 50 DATA p256vmsl<>+0x8(SB)/8, $0x00191a1b1c1d1e1f 51 DATA p256vmsl<>+0x10(SB)/8, $0x0012131415161718 52 DATA p256vmsl<>+0x18(SB)/8, $0x000b0c0d0e0f1011 53 DATA p256vmsl<>+0x20(SB)/8, $0x00191a1b1c1d1e1f 54 DATA p256vmsl<>+0x28(SB)/8, $0x0012131415161718 55 DATA p256vmsl<>+0x30(SB)/8, $0x000b0c0d0e0f1011 56 DATA p256vmsl<>+0x38(SB)/8, $0x0012131415161718 57 DATA p256vmsl<>+0x40(SB)/8, $0x000405060708090a 58 DATA p256vmsl<>+0x48(SB)/8, $0x000b0c0d0e0f1011 59 DATA p256vmsl<>+0x50(SB)/8, $0x000b0c0d0e0f1011 60 DATA p256vmsl<>+0x58(SB)/8, $0x000405060708090a 61 DATA p256vmsl<>+0x60(SB)/8, $0x1010101000010203 62 DATA p256vmsl<>+0x68(SB)/8, $0x100405060708090a 63 DATA p256vmsl<>+0x70(SB)/8, $0x100405060708090a 64 DATA p256vmsl<>+0x78(SB)/8, $0x1010101000010203 65 GLOBL p256vmsl<>(SB), 8, $128 66 67 // --------------------------------------- 68 // iff cond == 1 val <- -val 69 // func p256NegCond(val *p256Point, cond int) 70 #define P1ptr R1 71 #define CPOOL R4 72 73 #define Y1L V0 74 #define Y1H V1 75 #define T1L V2 76 #define T1H V3 77 78 #define PL V30 79 #define PH V31 80 81 #define ZER V4 82 #define SEL1 V5 83 #define CAR1 V6 84 TEXT ·p256NegCond(SB), NOSPLIT, $0 85 MOVD val+0(FP), P1ptr 86 87 MOVD $p256mul<>+0x00(SB), CPOOL 88 VL 16(CPOOL), PL 89 VL 0(CPOOL), PH 90 91 VL 32(P1ptr), Y1H 92 VL 48(P1ptr), Y1L 93 94 VLREPG cond+8(FP), SEL1 95 VZERO ZER 96 VCEQG SEL1, ZER, SEL1 97 98 VSCBIQ Y1L, PL, CAR1 99 VSQ Y1L, PL, T1L 100 VSBIQ PH, Y1H, CAR1, T1H 101 102 VSEL Y1L, T1L, SEL1, Y1L 103 VSEL Y1H, T1H, SEL1, Y1H 104 105 VST Y1H, 32(P1ptr) 106 VST Y1L, 48(P1ptr) 107 RET 108 109 #undef P1ptr 110 #undef CPOOL 111 #undef Y1L 112 #undef Y1H 113 #undef T1L 114 #undef T1H 115 #undef PL 116 #undef PH 117 #undef ZER 118 #undef SEL1 119 #undef CAR1 120 121 // --------------------------------------- 122 // if cond == 0 res <- b; else res <- a 123 // func p256MovCond(res, a, b *p256Point, cond int) 124 #define P3ptr R1 125 #define P1ptr R2 126 #define P2ptr R3 127 128 #define X1L V0 129 #define X1H V1 130 #define Y1L V2 131 #define Y1H V3 132 #define Z1L V4 133 #define Z1H V5 134 #define X2L V6 135 #define X2H V7 136 #define Y2L V8 137 #define Y2H V9 138 #define Z2L V10 139 #define Z2H V11 140 141 #define ZER V18 142 #define SEL1 V19 143 TEXT ·p256MovCond(SB), NOSPLIT, $0 144 MOVD res+0(FP), P3ptr 145 MOVD a+8(FP), P1ptr 146 MOVD b+16(FP), P2ptr 147 VLREPG cond+24(FP), SEL1 148 VZERO ZER 149 VCEQG SEL1, ZER, SEL1 150 151 VL 0(P1ptr), X1H 152 VL 16(P1ptr), X1L 153 VL 32(P1ptr), Y1H 154 VL 48(P1ptr), Y1L 155 VL 64(P1ptr), Z1H 156 VL 80(P1ptr), Z1L 157 158 VL 0(P2ptr), X2H 159 VL 16(P2ptr), X2L 160 VL 32(P2ptr), Y2H 161 VL 48(P2ptr), Y2L 162 VL 64(P2ptr), Z2H 163 VL 80(P2ptr), Z2L 164 165 VSEL X2L, X1L, SEL1, X1L 166 VSEL X2H, X1H, SEL1, X1H 167 VSEL Y2L, Y1L, SEL1, Y1L 168 VSEL Y2H, Y1H, SEL1, Y1H 169 VSEL Z2L, Z1L, SEL1, Z1L 170 VSEL Z2H, Z1H, SEL1, Z1H 171 172 VST X1H, 0(P3ptr) 173 VST X1L, 16(P3ptr) 174 VST Y1H, 32(P3ptr) 175 VST Y1L, 48(P3ptr) 176 VST Z1H, 64(P3ptr) 177 VST Z1L, 80(P3ptr) 178 179 RET 180 181 #undef P3ptr 182 #undef P1ptr 183 #undef P2ptr 184 #undef X1L 185 #undef X1H 186 #undef Y1L 187 #undef Y1H 188 #undef Z1L 189 #undef Z1H 190 #undef X2L 191 #undef X2H 192 #undef Y2L 193 #undef Y2H 194 #undef Z2L 195 #undef Z2H 196 #undef ZER 197 #undef SEL1 198 199 // --------------------------------------- 200 // Constant time table access 201 // Indexed from 1 to 15, with -1 offset 202 // (index 0 is implicitly point at infinity) 203 // func p256Select(point *p256Point, table []p256Point, idx int) 204 #define P3ptr R1 205 #define P1ptr R2 206 #define COUNT R4 207 208 #define X1L V0 209 #define X1H V1 210 #define Y1L V2 211 #define Y1H V3 212 #define Z1L V4 213 #define Z1H V5 214 #define X2L V6 215 #define X2H V7 216 #define Y2L V8 217 #define Y2H V9 218 #define Z2L V10 219 #define Z2H V11 220 221 #define ONE V18 222 #define IDX V19 223 #define SEL1 V20 224 #define SEL2 V21 225 TEXT ·p256Select(SB), NOSPLIT, $0 226 MOVD point+0(FP), P3ptr 227 MOVD table+8(FP), P1ptr 228 VLREPB idx+(32+7)(FP), IDX 229 VREPIB $1, ONE 230 VREPIB $1, SEL2 231 MOVD $1, COUNT 232 233 VZERO X1H 234 VZERO X1L 235 VZERO Y1H 236 VZERO Y1L 237 VZERO Z1H 238 VZERO Z1L 239 240 loop_select: 241 VL 0(P1ptr), X2H 242 VL 16(P1ptr), X2L 243 VL 32(P1ptr), Y2H 244 VL 48(P1ptr), Y2L 245 VL 64(P1ptr), Z2H 246 VL 80(P1ptr), Z2L 247 248 VCEQG SEL2, IDX, SEL1 249 250 VSEL X2L, X1L, SEL1, X1L 251 VSEL X2H, X1H, SEL1, X1H 252 VSEL Y2L, Y1L, SEL1, Y1L 253 VSEL Y2H, Y1H, SEL1, Y1H 254 VSEL Z2L, Z1L, SEL1, Z1L 255 VSEL Z2H, Z1H, SEL1, Z1H 256 257 VAB SEL2, ONE, SEL2 258 ADDW $1, COUNT 259 ADD $96, P1ptr 260 CMPW COUNT, $17 261 BLT loop_select 262 263 VST X1H, 0(P3ptr) 264 VST X1L, 16(P3ptr) 265 VST Y1H, 32(P3ptr) 266 VST Y1L, 48(P3ptr) 267 VST Z1H, 64(P3ptr) 268 VST Z1L, 80(P3ptr) 269 RET 270 271 #undef P3ptr 272 #undef P1ptr 273 #undef COUNT 274 #undef X1L 275 #undef X1H 276 #undef Y1L 277 #undef Y1H 278 #undef Z1L 279 #undef Z1H 280 #undef X2L 281 #undef X2H 282 #undef Y2L 283 #undef Y2H 284 #undef Z2L 285 #undef Z2H 286 #undef ONE 287 #undef IDX 288 #undef SEL1 289 #undef SEL2 290 291 // --------------------------------------- 292 // Constant time table access 293 // Indexed from 1 to 15, with -1 offset 294 // (index 0 is implicitly point at infinity) 295 // func p256SelectBase(point *p256Point, table []p256Point, idx int) 296 #define P3ptr R1 297 #define P1ptr R2 298 #define COUNT R4 299 300 #define X1L V0 301 #define X1H V1 302 #define Y1L V2 303 #define Y1H V3 304 #define Z1L V4 305 #define Z1H V5 306 #define X2L V6 307 #define X2H V7 308 #define Y2L V8 309 #define Y2H V9 310 #define Z2L V10 311 #define Z2H V11 312 313 #define ONE V18 314 #define IDX V19 315 #define SEL1 V20 316 #define SEL2 V21 317 TEXT ·p256SelectBase(SB), NOSPLIT, $0 318 MOVD point+0(FP), P3ptr 319 MOVD table+8(FP), P1ptr 320 VLREPB idx+(32+7)(FP), IDX 321 VREPIB $1, ONE 322 VREPIB $1, SEL2 323 MOVD $1, COUNT 324 325 VZERO X1H 326 VZERO X1L 327 VZERO Y1H 328 VZERO Y1L 329 VZERO Z1H 330 VZERO Z1L 331 332 loop_select: 333 VL 0(P1ptr), X2H 334 VL 16(P1ptr), X2L 335 VL 32(P1ptr), Y2H 336 VL 48(P1ptr), Y2L 337 VL 64(P1ptr), Z2H 338 VL 80(P1ptr), Z2L 339 340 VCEQG SEL2, IDX, SEL1 341 342 VSEL X2L, X1L, SEL1, X1L 343 VSEL X2H, X1H, SEL1, X1H 344 VSEL Y2L, Y1L, SEL1, Y1L 345 VSEL Y2H, Y1H, SEL1, Y1H 346 VSEL Z2L, Z1L, SEL1, Z1L 347 VSEL Z2H, Z1H, SEL1, Z1H 348 349 VAB SEL2, ONE, SEL2 350 ADDW $1, COUNT 351 ADD $96, P1ptr 352 CMPW COUNT, $65 353 BLT loop_select 354 355 VST X1H, 0(P3ptr) 356 VST X1L, 16(P3ptr) 357 VST Y1H, 32(P3ptr) 358 VST Y1L, 48(P3ptr) 359 VST Z1H, 64(P3ptr) 360 VST Z1L, 80(P3ptr) 361 RET 362 363 #undef P3ptr 364 #undef P1ptr 365 #undef COUNT 366 #undef X1L 367 #undef X1H 368 #undef Y1L 369 #undef Y1H 370 #undef Z1L 371 #undef Z1H 372 #undef X2L 373 #undef X2H 374 #undef Y2L 375 #undef Y2H 376 #undef Z2L 377 #undef Z2H 378 #undef ONE 379 #undef IDX 380 #undef SEL1 381 #undef SEL2 382 383 // --------------------------------------- 384 // func p256FromMont(res, in []byte) 385 #define res_ptr R1 386 #define x_ptr R2 387 #define CPOOL R4 388 389 #define T0 V0 390 #define T1 V1 391 #define T2 V2 392 #define TT0 V3 393 #define TT1 V4 394 395 #define ZER V6 396 #define SEL1 V7 397 #define SEL2 V8 398 #define CAR1 V9 399 #define CAR2 V10 400 #define RED1 V11 401 #define RED2 V12 402 #define PL V13 403 #define PH V14 404 405 TEXT ·p256FromMont(SB), NOSPLIT, $0 406 MOVD res+0(FP), res_ptr 407 MOVD in+24(FP), x_ptr 408 409 VZERO T2 410 VZERO ZER 411 MOVD $p256<>+0x00(SB), CPOOL 412 VL 16(CPOOL), PL 413 VL 0(CPOOL), PH 414 VL 48(CPOOL), SEL2 415 VL 64(CPOOL), SEL1 416 417 VL (1*16)(x_ptr), T0 418 VL (0*16)(x_ptr), T1 419 420 // First round 421 VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0 422 VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0 423 VSQ RED1, RED2, RED2 // Guaranteed not to underflow 424 425 VSLDB $8, T1, T0, T0 426 VSLDB $8, T2, T1, T1 427 428 VACCQ T0, RED1, CAR1 429 VAQ T0, RED1, T0 430 VACCCQ T1, RED2, CAR1, CAR2 431 VACQ T1, RED2, CAR1, T1 432 VAQ T2, CAR2, T2 433 434 // Second round 435 VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0 436 VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0 437 VSQ RED1, RED2, RED2 // Guaranteed not to underflow 438 439 VSLDB $8, T1, T0, T0 440 VSLDB $8, T2, T1, T1 441 442 VACCQ T0, RED1, CAR1 443 VAQ T0, RED1, T0 444 VACCCQ T1, RED2, CAR1, CAR2 445 VACQ T1, RED2, CAR1, T1 446 VAQ T2, CAR2, T2 447 448 // Third round 449 VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0 450 VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0 451 VSQ RED1, RED2, RED2 // Guaranteed not to underflow 452 453 VSLDB $8, T1, T0, T0 454 VSLDB $8, T2, T1, T1 455 456 VACCQ T0, RED1, CAR1 457 VAQ T0, RED1, T0 458 VACCCQ T1, RED2, CAR1, CAR2 459 VACQ T1, RED2, CAR1, T1 460 VAQ T2, CAR2, T2 461 462 // Last round 463 VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0 464 VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0 465 VSQ RED1, RED2, RED2 // Guaranteed not to underflow 466 467 VSLDB $8, T1, T0, T0 468 VSLDB $8, T2, T1, T1 469 470 VACCQ T0, RED1, CAR1 471 VAQ T0, RED1, T0 472 VACCCQ T1, RED2, CAR1, CAR2 473 VACQ T1, RED2, CAR1, T1 474 VAQ T2, CAR2, T2 475 476 // --------------------------------------------------- 477 478 VSCBIQ PL, T0, CAR1 479 VSQ PL, T0, TT0 480 VSBCBIQ T1, PH, CAR1, CAR2 481 VSBIQ T1, PH, CAR1, TT1 482 VSBIQ T2, ZER, CAR2, T2 483 484 // what output to use, TT1||TT0 or T1||T0? 485 VSEL T0, TT0, T2, T0 486 VSEL T1, TT1, T2, T1 487 488 VST T0, (1*16)(res_ptr) 489 VST T1, (0*16)(res_ptr) 490 RET 491 492 #undef res_ptr 493 #undef x_ptr 494 #undef CPOOL 495 #undef T0 496 #undef T1 497 #undef T2 498 #undef TT0 499 #undef TT1 500 #undef ZER 501 #undef SEL1 502 #undef SEL2 503 #undef CAR1 504 #undef CAR2 505 #undef RED1 506 #undef RED2 507 #undef PL 508 #undef PH 509 510 // --------------------------------------- 511 // func p256OrdMul(res, in1, in2 []byte) 512 #define res_ptr R1 513 #define x_ptr R2 514 #define y_ptr R3 515 #define X0 V0 516 #define X1 V1 517 #define Y0 V2 518 #define Y1 V3 519 #define M0 V4 520 #define M1 V5 521 #define T0 V6 522 #define T1 V7 523 #define T2 V8 524 #define YDIG V9 525 526 #define ADD1 V16 527 #define ADD1H V17 528 #define ADD2 V18 529 #define ADD2H V19 530 #define RED1 V20 531 #define RED1H V21 532 #define RED2 V22 533 #define RED2H V23 534 #define CAR1 V24 535 #define CAR1M V25 536 537 #define MK0 V30 538 #define K0 V31 539 TEXT ·p256OrdMul(SB), NOSPLIT, $0 540 MOVD res+0(FP), res_ptr 541 MOVD in1+24(FP), x_ptr 542 MOVD in2+48(FP), y_ptr 543 544 VZERO T2 545 MOVD $p256ordK0<>+0x00(SB), R4 546 547 // VLEF $3, 0(R4), K0 548 WORD $0xE7F40000 549 BYTE $0x38 550 BYTE $0x03 551 MOVD $p256ord<>+0x00(SB), R4 552 VL 16(R4), M0 553 VL 0(R4), M1 554 555 VL (1*16)(x_ptr), X0 556 VL (0*16)(x_ptr), X1 557 VL (1*16)(y_ptr), Y0 558 VL (0*16)(y_ptr), Y1 559 560 // ---------------------------------------------------------------------------/ 561 VREPF $3, Y0, YDIG 562 VMLF X0, YDIG, ADD1 563 VMLF ADD1, K0, MK0 564 VREPF $3, MK0, MK0 565 566 VMLF X1, YDIG, ADD2 567 VMLHF X0, YDIG, ADD1H 568 VMLHF X1, YDIG, ADD2H 569 570 VMALF M0, MK0, ADD1, RED1 571 VMALHF M0, MK0, ADD1, RED1H 572 VMALF M1, MK0, ADD2, RED2 573 VMALHF M1, MK0, ADD2, RED2H 574 575 VSLDB $12, RED2, RED1, RED1 576 VSLDB $12, T2, RED2, RED2 577 578 VACCQ RED1, ADD1H, CAR1 579 VAQ RED1, ADD1H, T0 580 VACCQ RED1H, T0, CAR1M 581 VAQ RED1H, T0, T0 582 583 // << ready for next MK0 584 585 VACQ RED2, ADD2H, CAR1, T1 586 VACCCQ RED2, ADD2H, CAR1, CAR1 587 VACCCQ RED2H, T1, CAR1M, T2 588 VACQ RED2H, T1, CAR1M, T1 589 VAQ CAR1, T2, T2 590 591 // --------------------------------------------------- 592 /* * 593 * ---+--------+--------+ 594 * T2| T1 | T0 | 595 * ---+--------+--------+ 596 * *(add)* 597 * +--------+--------+ 598 * | X1 | X0 | 599 * +--------+--------+ 600 * *(mul)* 601 * +--------+--------+ 602 * | YDIG | YDIG | 603 * +--------+--------+ 604 * *(add)* 605 * +--------+--------+ 606 * | M1 | M0 | 607 * +--------+--------+ 608 * *(mul)* 609 * +--------+--------+ 610 * | MK0 | MK0 | 611 * +--------+--------+ 612 * 613 * --------------------- 614 * 615 * +--------+--------+ 616 * | ADD2 | ADD1 | 617 * +--------+--------+ 618 * +--------+--------+ 619 * | ADD2H | ADD1H | 620 * +--------+--------+ 621 * +--------+--------+ 622 * | RED2 | RED1 | 623 * +--------+--------+ 624 * +--------+--------+ 625 * | RED2H | RED1H | 626 * +--------+--------+ 627 */ 628 VREPF $2, Y0, YDIG 629 VMALF X0, YDIG, T0, ADD1 630 VMLF ADD1, K0, MK0 631 VREPF $3, MK0, MK0 632 633 VMALF X1, YDIG, T1, ADD2 634 VMALHF X0, YDIG, T0, ADD1H 635 VMALHF X1, YDIG, T1, ADD2H 636 637 VMALF M0, MK0, ADD1, RED1 638 VMALHF M0, MK0, ADD1, RED1H 639 VMALF M1, MK0, ADD2, RED2 640 VMALHF M1, MK0, ADD2, RED2H 641 642 VSLDB $12, RED2, RED1, RED1 643 VSLDB $12, T2, RED2, RED2 644 645 VACCQ RED1, ADD1H, CAR1 646 VAQ RED1, ADD1H, T0 647 VACCQ RED1H, T0, CAR1M 648 VAQ RED1H, T0, T0 649 650 // << ready for next MK0 651 652 VACQ RED2, ADD2H, CAR1, T1 653 VACCCQ RED2, ADD2H, CAR1, CAR1 654 VACCCQ RED2H, T1, CAR1M, T2 655 VACQ RED2H, T1, CAR1M, T1 656 VAQ CAR1, T2, T2 657 658 // --------------------------------------------------- 659 VREPF $1, Y0, YDIG 660 VMALF X0, YDIG, T0, ADD1 661 VMLF ADD1, K0, MK0 662 VREPF $3, MK0, MK0 663 664 VMALF X1, YDIG, T1, ADD2 665 VMALHF X0, YDIG, T0, ADD1H 666 VMALHF X1, YDIG, T1, ADD2H 667 668 VMALF M0, MK0, ADD1, RED1 669 VMALHF M0, MK0, ADD1, RED1H 670 VMALF M1, MK0, ADD2, RED2 671 VMALHF M1, MK0, ADD2, RED2H 672 673 VSLDB $12, RED2, RED1, RED1 674 VSLDB $12, T2, RED2, RED2 675 676 VACCQ RED1, ADD1H, CAR1 677 VAQ RED1, ADD1H, T0 678 VACCQ RED1H, T0, CAR1M 679 VAQ RED1H, T0, T0 680 681 // << ready for next MK0 682 683 VACQ RED2, ADD2H, CAR1, T1 684 VACCCQ RED2, ADD2H, CAR1, CAR1 685 VACCCQ RED2H, T1, CAR1M, T2 686 VACQ RED2H, T1, CAR1M, T1 687 VAQ CAR1, T2, T2 688 689 // --------------------------------------------------- 690 VREPF $0, Y0, YDIG 691 VMALF X0, YDIG, T0, ADD1 692 VMLF ADD1, K0, MK0 693 VREPF $3, MK0, MK0 694 695 VMALF X1, YDIG, T1, ADD2 696 VMALHF X0, YDIG, T0, ADD1H 697 VMALHF X1, YDIG, T1, ADD2H 698 699 VMALF M0, MK0, ADD1, RED1 700 VMALHF M0, MK0, ADD1, RED1H 701 VMALF M1, MK0, ADD2, RED2 702 VMALHF M1, MK0, ADD2, RED2H 703 704 VSLDB $12, RED2, RED1, RED1 705 VSLDB $12, T2, RED2, RED2 706 707 VACCQ RED1, ADD1H, CAR1 708 VAQ RED1, ADD1H, T0 709 VACCQ RED1H, T0, CAR1M 710 VAQ RED1H, T0, T0 711 712 // << ready for next MK0 713 714 VACQ RED2, ADD2H, CAR1, T1 715 VACCCQ RED2, ADD2H, CAR1, CAR1 716 VACCCQ RED2H, T1, CAR1M, T2 717 VACQ RED2H, T1, CAR1M, T1 718 VAQ CAR1, T2, T2 719 720 // --------------------------------------------------- 721 VREPF $3, Y1, YDIG 722 VMALF X0, YDIG, T0, ADD1 723 VMLF ADD1, K0, MK0 724 VREPF $3, MK0, MK0 725 726 VMALF X1, YDIG, T1, ADD2 727 VMALHF X0, YDIG, T0, ADD1H 728 VMALHF X1, YDIG, T1, ADD2H 729 730 VMALF M0, MK0, ADD1, RED1 731 VMALHF M0, MK0, ADD1, RED1H 732 VMALF M1, MK0, ADD2, RED2 733 VMALHF M1, MK0, ADD2, RED2H 734 735 VSLDB $12, RED2, RED1, RED1 736 VSLDB $12, T2, RED2, RED2 737 738 VACCQ RED1, ADD1H, CAR1 739 VAQ RED1, ADD1H, T0 740 VACCQ RED1H, T0, CAR1M 741 VAQ RED1H, T0, T0 742 743 // << ready for next MK0 744 745 VACQ RED2, ADD2H, CAR1, T1 746 VACCCQ RED2, ADD2H, CAR1, CAR1 747 VACCCQ RED2H, T1, CAR1M, T2 748 VACQ RED2H, T1, CAR1M, T1 749 VAQ CAR1, T2, T2 750 751 // --------------------------------------------------- 752 VREPF $2, Y1, YDIG 753 VMALF X0, YDIG, T0, ADD1 754 VMLF ADD1, K0, MK0 755 VREPF $3, MK0, MK0 756 757 VMALF X1, YDIG, T1, ADD2 758 VMALHF X0, YDIG, T0, ADD1H 759 VMALHF X1, YDIG, T1, ADD2H 760 761 VMALF M0, MK0, ADD1, RED1 762 VMALHF M0, MK0, ADD1, RED1H 763 VMALF M1, MK0, ADD2, RED2 764 VMALHF M1, MK0, ADD2, RED2H 765 766 VSLDB $12, RED2, RED1, RED1 767 VSLDB $12, T2, RED2, RED2 768 769 VACCQ RED1, ADD1H, CAR1 770 VAQ RED1, ADD1H, T0 771 VACCQ RED1H, T0, CAR1M 772 VAQ RED1H, T0, T0 773 774 // << ready for next MK0 775 776 VACQ RED2, ADD2H, CAR1, T1 777 VACCCQ RED2, ADD2H, CAR1, CAR1 778 VACCCQ RED2H, T1, CAR1M, T2 779 VACQ RED2H, T1, CAR1M, T1 780 VAQ CAR1, T2, T2 781 782 // --------------------------------------------------- 783 VREPF $1, Y1, YDIG 784 VMALF X0, YDIG, T0, ADD1 785 VMLF ADD1, K0, MK0 786 VREPF $3, MK0, MK0 787 788 VMALF X1, YDIG, T1, ADD2 789 VMALHF X0, YDIG, T0, ADD1H 790 VMALHF X1, YDIG, T1, ADD2H 791 792 VMALF M0, MK0, ADD1, RED1 793 VMALHF M0, MK0, ADD1, RED1H 794 VMALF M1, MK0, ADD2, RED2 795 VMALHF M1, MK0, ADD2, RED2H 796 797 VSLDB $12, RED2, RED1, RED1 798 VSLDB $12, T2, RED2, RED2 799 800 VACCQ RED1, ADD1H, CAR1 801 VAQ RED1, ADD1H, T0 802 VACCQ RED1H, T0, CAR1M 803 VAQ RED1H, T0, T0 804 805 // << ready for next MK0 806 807 VACQ RED2, ADD2H, CAR1, T1 808 VACCCQ RED2, ADD2H, CAR1, CAR1 809 VACCCQ RED2H, T1, CAR1M, T2 810 VACQ RED2H, T1, CAR1M, T1 811 VAQ CAR1, T2, T2 812 813 // --------------------------------------------------- 814 VREPF $0, Y1, YDIG 815 VMALF X0, YDIG, T0, ADD1 816 VMLF ADD1, K0, MK0 817 VREPF $3, MK0, MK0 818 819 VMALF X1, YDIG, T1, ADD2 820 VMALHF X0, YDIG, T0, ADD1H 821 VMALHF X1, YDIG, T1, ADD2H 822 823 VMALF M0, MK0, ADD1, RED1 824 VMALHF M0, MK0, ADD1, RED1H 825 VMALF M1, MK0, ADD2, RED2 826 VMALHF M1, MK0, ADD2, RED2H 827 828 VSLDB $12, RED2, RED1, RED1 829 VSLDB $12, T2, RED2, RED2 830 831 VACCQ RED1, ADD1H, CAR1 832 VAQ RED1, ADD1H, T0 833 VACCQ RED1H, T0, CAR1M 834 VAQ RED1H, T0, T0 835 836 // << ready for next MK0 837 838 VACQ RED2, ADD2H, CAR1, T1 839 VACCCQ RED2, ADD2H, CAR1, CAR1 840 VACCCQ RED2H, T1, CAR1M, T2 841 VACQ RED2H, T1, CAR1M, T1 842 VAQ CAR1, T2, T2 843 844 // --------------------------------------------------- 845 846 VZERO RED1 847 VSCBIQ M0, T0, CAR1 848 VSQ M0, T0, ADD1 849 VSBCBIQ T1, M1, CAR1, CAR1M 850 VSBIQ T1, M1, CAR1, ADD2 851 VSBIQ T2, RED1, CAR1M, T2 852 853 // what output to use, ADD2||ADD1 or T1||T0? 854 VSEL T0, ADD1, T2, T0 855 VSEL T1, ADD2, T2, T1 856 857 VST T0, (1*16)(res_ptr) 858 VST T1, (0*16)(res_ptr) 859 RET 860 861 #undef res_ptr 862 #undef x_ptr 863 #undef y_ptr 864 #undef X0 865 #undef X1 866 #undef Y0 867 #undef Y1 868 #undef M0 869 #undef M1 870 #undef T0 871 #undef T1 872 #undef T2 873 #undef YDIG 874 875 #undef ADD1 876 #undef ADD1H 877 #undef ADD2 878 #undef ADD2H 879 #undef RED1 880 #undef RED1H 881 #undef RED2 882 #undef RED2H 883 #undef CAR1 884 #undef CAR1M 885 886 #undef MK0 887 #undef K0 888 889 // --------------------------------------- 890 // p256MulInternalVX 891 // V0-V3,V30,V31 - Not Modified 892 // V4-V15 - Volatile 893 894 #define CPOOL R4 895 896 // Parameters 897 #define X0 V0 // Not modified 898 #define X1 V1 // Not modified 899 #define Y0 V2 // Not modified 900 #define Y1 V3 // Not modified 901 #define T0 V4 902 #define T1 V5 903 #define P0 V30 // Not modified 904 #define P1 V31 // Not modified 905 906 // Temporaries 907 #define YDIG V6 // Overloaded with CAR2, ZER 908 #define ADD1H V7 // Overloaded with ADD3H 909 #define ADD2H V8 // Overloaded with ADD4H 910 #define ADD3 V9 // Overloaded with SEL2,SEL5 911 #define ADD4 V10 // Overloaded with SEL3,SEL6 912 #define RED1 V11 // Overloaded with CAR2 913 #define RED2 V12 914 #define RED3 V13 // Overloaded with SEL1 915 #define T2 V14 916 // Overloaded temporaries 917 #define ADD1 V4 // Overloaded with T0 918 #define ADD2 V5 // Overloaded with T1 919 #define ADD3H V7 // Overloaded with ADD1H 920 #define ADD4H V8 // Overloaded with ADD2H 921 #define ZER V6 // Overloaded with YDIG, CAR2 922 #define CAR1 V6 // Overloaded with YDIG, ZER 923 #define CAR2 V11 // Overloaded with RED1 924 // Constant Selects 925 #define SEL1 V13 // Overloaded with RED3 926 #define SEL2 V9 // Overloaded with ADD3,SEL5 927 #define SEL3 V10 // Overloaded with ADD4,SEL6 928 #define SEL4 V6 // Overloaded with YDIG,CAR2,ZER 929 #define SEL5 V9 // Overloaded with ADD3,SEL2 930 #define SEL6 V10 // Overloaded with ADD4,SEL3 931 932 /* * 933 * To follow the flow of bits, for your own sanity a stiff drink, need you shall. 934 * Of a single round, a 'helpful' picture, here is. Meaning, column position has. 935 * With you, SIMD be... 936 * 937 * +--------+--------+ 938 * +--------| RED2 | RED1 | 939 * | +--------+--------+ 940 * | ---+--------+--------+ 941 * | +---- T2| T1 | T0 |--+ 942 * | | ---+--------+--------+ | 943 * | | | 944 * | | ======================= | 945 * | | | 946 * | | +--------+--------+<-+ 947 * | +-------| ADD2 | ADD1 |--|-----+ 948 * | | +--------+--------+ | | 949 * | | +--------+--------+<---+ | 950 * | | | ADD2H | ADD1H |--+ | 951 * | | +--------+--------+ | | 952 * | | +--------+--------+<-+ | 953 * | | | ADD4 | ADD3 |--|-+ | 954 * | | +--------+--------+ | | | 955 * | | +--------+--------+<---+ | | 956 * | | | ADD4H | ADD3H |------|-+ |(+vzero) 957 * | | +--------+--------+ | | V 958 * | | ------------------------ | | +--------+ 959 * | | | | | RED3 | [d0 0 0 d0] 960 * | | | | +--------+ 961 * | +---->+--------+--------+ | | | 962 * (T2[1w]||ADD2[4w]||ADD1[3w]) +--------| T1 | T0 | | | | 963 * | +--------+--------+ | | | 964 * +---->---+--------+--------+ | | | 965 * T2| T1 | T0 |----+ | | 966 * ---+--------+--------+ | | | 967 * ---+--------+--------+<---+ | | 968 * +--- T2| T1 | T0 |----------+ 969 * | ---+--------+--------+ | | 970 * | +--------+--------+<-------------+ 971 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0] 972 * | +--------+--------+ | | | 973 * | +--------+<----------------------+ 974 * | | RED3 |--------------+ | [0 0 d1 d0] 975 * | +--------+ | | 976 * +--->+--------+--------+ | | 977 * | T1 | T0 |--------+ 978 * +--------+--------+ | | 979 * --------------------------- | | 980 * | | 981 * +--------+--------+<----+ | 982 * | RED2 | RED1 | | 983 * +--------+--------+ | 984 * ---+--------+--------+<-------+ 985 * T2| T1 | T0 | (H1P-H1P-H00RRAY!) 986 * ---+--------+--------+ 987 * 988 * *Mi obra de arte de siglo XXI @vpaprots 989 * 990 * 991 * First group is special, doesn't get the two inputs: 992 * +--------+--------+<-+ 993 * +-------| ADD2 | ADD1 |--|-----+ 994 * | +--------+--------+ | | 995 * | +--------+--------+<---+ | 996 * | | ADD2H | ADD1H |--+ | 997 * | +--------+--------+ | | 998 * | +--------+--------+<-+ | 999 * | | ADD4 | ADD3 |--|-+ | 1000 * | +--------+--------+ | | | 1001 * | +--------+--------+<---+ | | 1002 * | | ADD4H | ADD3H |------|-+ |(+vzero) 1003 * | +--------+--------+ | | V 1004 * | ------------------------ | | +--------+ 1005 * | | | | RED3 | [d0 0 0 d0] 1006 * | | | +--------+ 1007 * +---->+--------+--------+ | | | 1008 * (T2[1w]||ADD2[4w]||ADD1[3w]) | T1 | T0 |----+ | | 1009 * +--------+--------+ | | | 1010 * ---+--------+--------+<---+ | | 1011 * +--- T2| T1 | T0 |----------+ 1012 * | ---+--------+--------+ | | 1013 * | +--------+--------+<-------------+ 1014 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0] 1015 * | +--------+--------+ | | | 1016 * | +--------+<----------------------+ 1017 * | | RED3 |--------------+ | [0 0 d1 d0] 1018 * | +--------+ | | 1019 * +--->+--------+--------+ | | 1020 * | T1 | T0 |--------+ 1021 * +--------+--------+ | | 1022 * --------------------------- | | 1023 * | | 1024 * +--------+--------+<----+ | 1025 * | RED2 | RED1 | | 1026 * +--------+--------+ | 1027 * ---+--------+--------+<-------+ 1028 * T2| T1 | T0 | (H1P-H1P-H00RRAY!) 1029 * ---+--------+--------+ 1030 * 1031 * Last 'group' needs to RED2||RED1 shifted less 1032 */ 1033 TEXT ·p256MulInternalVX(SB), NOSPLIT, $0-0 1034 VL 32(CPOOL), SEL1 1035 VL 48(CPOOL), SEL2 1036 VL 64(CPOOL), SEL3 1037 VL 80(CPOOL), SEL4 1038 1039 // --------------------------------------------------- 1040 1041 VREPF $3, Y0, YDIG 1042 VMLHF X0, YDIG, ADD1H 1043 VMLHF X1, YDIG, ADD2H 1044 VMLF X0, YDIG, ADD1 1045 VMLF X1, YDIG, ADD2 1046 1047 VREPF $2, Y0, YDIG 1048 VMALF X0, YDIG, ADD1H, ADD3 1049 VMALF X1, YDIG, ADD2H, ADD4 1050 VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free 1051 VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free 1052 1053 VZERO ZER 1054 VL 32(CPOOL), SEL1 1055 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] 1056 1057 VSLDB $12, ADD2, ADD1, T0 // ADD1 Free 1058 VSLDB $12, ZER, ADD2, T1 // ADD2 Free 1059 1060 VACCQ T0, ADD3, CAR1 1061 VAQ T0, ADD3, T0 // ADD3 Free 1062 VACCCQ T1, ADD4, CAR1, T2 1063 VACQ T1, ADD4, CAR1, T1 // ADD4 Free 1064 1065 VL 48(CPOOL), SEL2 1066 VL 64(CPOOL), SEL3 1067 VL 80(CPOOL), SEL4 1068 VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0] 1069 VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1] 1070 VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0] 1071 VSQ RED3, RED2, RED2 // Guaranteed not to underflow 1072 1073 VSLDB $12, T1, T0, T0 1074 VSLDB $12, T2, T1, T1 1075 1076 VACCQ T0, ADD3H, CAR1 1077 VAQ T0, ADD3H, T0 1078 VACCCQ T1, ADD4H, CAR1, T2 1079 VACQ T1, ADD4H, CAR1, T1 1080 1081 // --------------------------------------------------- 1082 1083 VREPF $1, Y0, YDIG 1084 VMALHF X0, YDIG, T0, ADD1H 1085 VMALHF X1, YDIG, T1, ADD2H 1086 VMALF X0, YDIG, T0, ADD1 // T0 Free->ADD1 1087 VMALF X1, YDIG, T1, ADD2 // T1 Free->ADD2 1088 1089 VREPF $0, Y0, YDIG 1090 VMALF X0, YDIG, ADD1H, ADD3 1091 VMALF X1, YDIG, ADD2H, ADD4 1092 VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H 1093 VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER 1094 1095 VZERO ZER 1096 VL 32(CPOOL), SEL1 1097 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] 1098 1099 VSLDB $12, ADD2, ADD1, T0 // ADD1 Free->T0 1100 VSLDB $12, T2, ADD2, T1 // ADD2 Free->T1, T2 Free 1101 1102 VACCQ T0, RED1, CAR1 1103 VAQ T0, RED1, T0 1104 VACCCQ T1, RED2, CAR1, T2 1105 VACQ T1, RED2, CAR1, T1 1106 1107 VACCQ T0, ADD3, CAR1 1108 VAQ T0, ADD3, T0 1109 VACCCQ T1, ADD4, CAR1, CAR2 1110 VACQ T1, ADD4, CAR1, T1 1111 VAQ T2, CAR2, T2 1112 1113 VL 48(CPOOL), SEL2 1114 VL 64(CPOOL), SEL3 1115 VL 80(CPOOL), SEL4 1116 VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0] 1117 VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1] 1118 VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0] 1119 VSQ RED3, RED2, RED2 // Guaranteed not to underflow 1120 1121 VSLDB $12, T1, T0, T0 1122 VSLDB $12, T2, T1, T1 1123 1124 VACCQ T0, ADD3H, CAR1 1125 VAQ T0, ADD3H, T0 1126 VACCCQ T1, ADD4H, CAR1, T2 1127 VACQ T1, ADD4H, CAR1, T1 1128 1129 // --------------------------------------------------- 1130 1131 VREPF $3, Y1, YDIG 1132 VMALHF X0, YDIG, T0, ADD1H 1133 VMALHF X1, YDIG, T1, ADD2H 1134 VMALF X0, YDIG, T0, ADD1 1135 VMALF X1, YDIG, T1, ADD2 1136 1137 VREPF $2, Y1, YDIG 1138 VMALF X0, YDIG, ADD1H, ADD3 1139 VMALF X1, YDIG, ADD2H, ADD4 1140 VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free 1141 VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free 1142 1143 VZERO ZER 1144 VL 32(CPOOL), SEL1 1145 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] 1146 1147 VSLDB $12, ADD2, ADD1, T0 // ADD1 Free 1148 VSLDB $12, T2, ADD2, T1 // ADD2 Free 1149 1150 VACCQ T0, RED1, CAR1 1151 VAQ T0, RED1, T0 1152 VACCCQ T1, RED2, CAR1, T2 1153 VACQ T1, RED2, CAR1, T1 1154 1155 VACCQ T0, ADD3, CAR1 1156 VAQ T0, ADD3, T0 1157 VACCCQ T1, ADD4, CAR1, CAR2 1158 VACQ T1, ADD4, CAR1, T1 1159 VAQ T2, CAR2, T2 1160 1161 VL 48(CPOOL), SEL2 1162 VL 64(CPOOL), SEL3 1163 VL 80(CPOOL), SEL4 1164 VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0] 1165 VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1] 1166 VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0] 1167 VSQ RED3, RED2, RED2 // Guaranteed not to underflow 1168 1169 VSLDB $12, T1, T0, T0 1170 VSLDB $12, T2, T1, T1 1171 1172 VACCQ T0, ADD3H, CAR1 1173 VAQ T0, ADD3H, T0 1174 VACCCQ T1, ADD4H, CAR1, T2 1175 VACQ T1, ADD4H, CAR1, T1 1176 1177 // --------------------------------------------------- 1178 1179 VREPF $1, Y1, YDIG 1180 VMALHF X0, YDIG, T0, ADD1H 1181 VMALHF X1, YDIG, T1, ADD2H 1182 VMALF X0, YDIG, T0, ADD1 1183 VMALF X1, YDIG, T1, ADD2 1184 1185 VREPF $0, Y1, YDIG 1186 VMALF X0, YDIG, ADD1H, ADD3 1187 VMALF X1, YDIG, ADD2H, ADD4 1188 VMALHF X0, YDIG, ADD1H, ADD3H 1189 VMALHF X1, YDIG, ADD2H, ADD4H 1190 1191 VZERO ZER 1192 VL 32(CPOOL), SEL1 1193 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] 1194 1195 VSLDB $12, ADD2, ADD1, T0 1196 VSLDB $12, T2, ADD2, T1 1197 1198 VACCQ T0, RED1, CAR1 1199 VAQ T0, RED1, T0 1200 VACCCQ T1, RED2, CAR1, T2 1201 VACQ T1, RED2, CAR1, T1 1202 1203 VACCQ T0, ADD3, CAR1 1204 VAQ T0, ADD3, T0 1205 VACCCQ T1, ADD4, CAR1, CAR2 1206 VACQ T1, ADD4, CAR1, T1 1207 VAQ T2, CAR2, T2 1208 1209 VL 96(CPOOL), SEL5 1210 VL 112(CPOOL), SEL6 1211 VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0] 1212 VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0 0] 1213 VSQ RED1, RED2, RED2 // Guaranteed not to underflow 1214 1215 VSLDB $12, T1, T0, T0 1216 VSLDB $12, T2, T1, T1 1217 1218 VACCQ T0, ADD3H, CAR1 1219 VAQ T0, ADD3H, T0 1220 VACCCQ T1, ADD4H, CAR1, T2 1221 VACQ T1, ADD4H, CAR1, T1 1222 1223 VACCQ T0, RED1, CAR1 1224 VAQ T0, RED1, T0 1225 VACCCQ T1, RED2, CAR1, CAR2 1226 VACQ T1, RED2, CAR1, T1 1227 VAQ T2, CAR2, T2 1228 1229 // --------------------------------------------------- 1230 1231 VZERO RED3 1232 VSCBIQ P0, T0, CAR1 1233 VSQ P0, T0, ADD1H 1234 VSBCBIQ T1, P1, CAR1, CAR2 1235 VSBIQ T1, P1, CAR1, ADD2H 1236 VSBIQ T2, RED3, CAR2, T2 1237 1238 // what output to use, ADD2H||ADD1H or T1||T0? 1239 VSEL T0, ADD1H, T2, T0 1240 VSEL T1, ADD2H, T2, T1 1241 RET 1242 1243 #undef CPOOL 1244 1245 #undef X0 1246 #undef X1 1247 #undef Y0 1248 #undef Y1 1249 #undef T0 1250 #undef T1 1251 #undef P0 1252 #undef P1 1253 1254 #undef SEL1 1255 #undef SEL2 1256 #undef SEL3 1257 #undef SEL4 1258 #undef SEL5 1259 #undef SEL6 1260 1261 #undef YDIG 1262 #undef ADD1H 1263 #undef ADD2H 1264 #undef ADD3 1265 #undef ADD4 1266 #undef RED1 1267 #undef RED2 1268 #undef RED3 1269 #undef T2 1270 #undef ADD1 1271 #undef ADD2 1272 #undef ADD3H 1273 #undef ADD4H 1274 #undef ZER 1275 #undef CAR1 1276 #undef CAR2 1277 1278 // --------------------------------------- 1279 // p256MulInternalVMSL 1280 // V0-V3,V30,V31 - Not Modified 1281 // V4-V14 - Volatile 1282 1283 #define CPOOL R4 1284 #define SCRATCH R9 1285 1286 // Parameters 1287 #define X0 V0 // Not modified 1288 #define X1 V1 // Not modified 1289 #define Y0 V2 // Not modified 1290 #define Y1 V3 // Not modified 1291 #define T0 V4 1292 #define T1 V5 1293 #define T2 V6 1294 #define P0 V30 // Not modified 1295 #define P1 V31 // Not modified 1296 1297 // input: d0 1298 // output: h0, h1 1299 // temp: TEMP, ZERO, BORROW 1300 #define OBSERVATION3(d0, h0, h1, TEMP, ZERO, BORROW) \ 1301 VZERO ZERO \ 1302 VSLDB $4, d0, ZERO, h0 \ 1303 VLR h0, BORROW \ 1304 VSLDB $12, ZERO, h0, TEMP \ 1305 VSQ TEMP, h0, h0 \ 1306 VSLDB $12, d0, BORROW, h1 \ 1307 VSLDB $8, ZERO, BORROW, TEMP \ 1308 VAQ TEMP, h0, h0 \ 1309 1310 #define OBSERVATION3A(d2, h0, h1, TEMP, ZERO) \ 1311 VZERO ZERO \ 1312 VSLDB $8, d2, ZERO, TEMP \ 1313 VSLDB $8, d2, TEMP, h0 \ 1314 VSLDB $12, ZERO, TEMP, h1 \ 1315 VSQ h1, h0, h0 \ 1316 1317 TEXT ·p256MulInternalVMSL(SB), NOFRAME|NOSPLIT, $0-0 1318 VSTM V16, V19, (SCRATCH) 1319 1320 MOVD $p256vmsl<>+0x00(SB), CPOOL 1321 1322 // Divide input1 into 5 limbs 1323 VGBM $0x007f, V14 1324 VZERO V12 1325 VSLDB $2, X1, X0, V13 1326 VSLDB $2, Y1, Y0, V8 1327 VSLDB $4, V12, X1, V11 // V11(X1): 4 bytes limb 1328 VSLDB $4, V12, Y1, V6 // V6: 4 bytes limb 1329 1330 VN V14, X0, V5 // V5: first 7 bytes limb 1331 VN V14, Y0, V10 // V10: first 7 bytes limb 1332 VN V14, V13, V13 // v13: third 7 bytes limb 1333 VN V14, V8, V8 // V8: third 7 bytes limb 1334 1335 VMSLG V10, V5, V12, V10 // v10: l10 x l5 (column 1) 1336 VMSLG V8, V5, V12, V8 // v8: l8 x l5 1337 VMSLG V6, V13, V12, V13 // v13: l6 x l3 1338 VMSLG V6, V11, V12, V11 // v11: l6 x l1 (column 9) 1339 VMSLG V6, V5, V12, V6 // v6: l6 x l5 1340 1341 MOVD $p256vmsl<>+0x00(SB), CPOOL 1342 VGBM $0x7f7f, V14 1343 1344 VL 0(CPOOL), V4 1345 VL 16(CPOOL), V7 1346 VL 32(CPOOL), V9 1347 VL 48(CPOOL), V5 1348 VLM 64(CPOOL), V16, V19 1349 1350 VPERM V12, X0, V4, V4 // v4: limb4 | limb5 1351 VPERM Y1, Y0, V7, V7 1352 VPERM V12, Y0, V9, V9 // v9: limb10 | limb9 1353 VPERM X1, X0, V5, V5 1354 VPERM X1, X0, V16, V16 1355 VPERM Y1, Y0, V17, V17 1356 VPERM X1, V12, V18, V18 // v18: limb1 | limb2 1357 VPERM Y1, V12, V19, V19 // v19: limb7 | limb6 1358 VN V14, V7, V7 // v7: limb9 | limb8 1359 VN V14, V5, V5 // v5: limb3 | limb4 1360 VN V14, V16, V16 // v16: limb2 | limb3 1361 VN V14, V17, V17 // v17: limb8 | limb7 1362 1363 VMSLG V9, V4, V12, V14 // v14: l10 x l4 + l9 x l5 (column 2) 1364 VMSLG V9, V5, V8, V8 // v8: l10 x l9 + l3 x l4 + l8 x l5 (column 3) 1365 VMSLG V9, V16, V12, V16 // v16: l10 x l9 + l2 x l3 1366 VMSLG V9, V18, V12, V9 // v9: l10 x l1 + l9 x l2 1367 VMSLG V7, V18, V12, V7 // v7: l9 x l1 + l8 x l2 1368 VMSLG V17, V4, V16, V16 // v16: l8 x l4 + l7 x l5 + l10 x l9 + l2 x l3 (column 4) 1369 VMSLG V17, V5, V9, V9 // v9: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4 1370 VMSLG V17, V18, V12, V17 // v18: l8 x l1 + l7 x l2 1371 VMSLG V19, V5, V7, V7 // v7: l9 x l1 + l8 x l2 + l7 x l3 + l6 x l4 (column 6) 1372 VMSLG V19, V18, V12, V19 // v19: l7 x l1 + l6 x l2 (column 8) 1373 VAQ V9, V6, V9 // v9: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4 + l6 x l5 (column 5) 1374 VAQ V17, V13, V13 // v13: l8 x l1 + l7 x l2 + l6 x l3 (column 7) 1375 1376 VSLDB $9, V12, V10, V4 1377 VSLDB $9, V12, V7, V5 1378 VAQ V4, V14, V14 1379 VAQ V5, V13, V13 1380 1381 VSLDB $9, V12, V14, V4 1382 VSLDB $9, V12, V13, V5 1383 VAQ V4, V8, V8 1384 VAQ V5, V19, V19 1385 1386 VSLDB $9, V12, V8, V4 1387 VSLDB $9, V12, V19, V5 1388 VAQ V4, V16, V16 1389 VAQ V5, V11, V11 1390 1391 VSLDB $9, V12, V16, V4 1392 VAQ V4, V9, V17 1393 1394 VGBM $0x007f, V4 1395 VGBM $0x00ff, V5 1396 1397 VN V10, V4, V10 1398 VN V14, V4, V14 1399 VN V8, V4, V8 1400 VN V16, V4, V16 1401 VN V17, V4, V9 1402 VN V7, V4, V7 1403 VN V13, V4, V13 1404 VN V19, V4, V19 1405 VN V11, V5, V11 1406 1407 VSLDB $7, V14, V14, V14 1408 VSLDB $14, V8, V12, V4 1409 VSLDB $14, V12, V8, V8 1410 VSLDB $5, V16, V16, V16 1411 VSLDB $12, V9, V12, V5 1412 1413 VO V14, V10, V10 1414 VO V8, V16, V16 1415 VO V4, V10, V10 // first rightmost 128bits of the multiplication result 1416 VO V5, V16, V16 // second rightmost 128bits of the multiplication result 1417 1418 // adjust v7, v13, v19, v11 1419 VSLDB $7, V13, V13, V13 1420 VSLDB $14, V19, V12, V4 1421 VSLDB $14, V12, V19, V19 1422 VSLDB $5, V11, V12, V5 1423 VO V13, V7, V7 1424 VO V4, V7, V7 1425 VO V19, V5, V11 1426 1427 VSLDB $9, V12, V17, V14 1428 VSLDB $12, V12, V9, V9 1429 VACCQ V7, V14, V13 1430 VAQ V7, V14, V7 1431 VAQ V11, V13, V11 1432 1433 // First reduction, 96 bits 1434 VSLDB $4, V16, V10, T0 1435 VSLDB $4, V12, V16, T1 1436 VSLDB $3, V11, V7, V11 // fourth rightmost 128bits of the multiplication result 1437 VSLDB $3, V7, V12, V7 1438 OBSERVATION3(V10, V8, T2, V17, V18, V19)// results V8 | T2 1439 VO V7, V9, V7 // third rightmost 128bits of the multiplication result 1440 VACCQ T0, T2, V9 1441 VAQ T0, T2, T2 1442 VACQ T1, V8, V9, V8 1443 1444 // Second reduction 96 bits 1445 VSLDB $4, V8, T2, T0 1446 VSLDB $4, V12, V8, T1 1447 OBSERVATION3(T2, V9, V8, V17, V18, V19)// results V9 | V8 1448 VACCQ T0, V8, T2 1449 VAQ T0, V8, V8 1450 VACQ T1, V9, T2, V9 1451 1452 // Third reduction 64 bits 1453 VSLDB $8, V9, V8, T0 1454 VSLDB $8, V12, V9, T1 1455 OBSERVATION3A(V8, V14, V13, V17, V18)// results V14 | V13 1456 VACCQ T0, V13, V12 1457 VAQ T0, V13, V13 1458 VACQ T1, V14, V12, V14 1459 VACCQ V13, V7, V12 1460 VAQ V13, V7, T0 1461 VACCCQ V14, V11, V12, T2 1462 VACQ V14, V11, V12, T1 // results T2 | T1 | T0 1463 1464 // --------------------------------------------------- 1465 MOVD $p256mul<>+0x00(SB), CPOOL 1466 1467 VZERO V12 1468 VSCBIQ P0, T0, V8 1469 VSQ P0, T0, V7 1470 VSBCBIQ T1, P1, V8, V10 1471 VSBIQ T1, P1, V8, V9 1472 VSBIQ T2, V12, V10, T2 1473 1474 // what output to use, V9||V7 or T1||T0? 1475 VSEL T0, V7, T2, T0 1476 VSEL T1, V9, T2, T1 1477 1478 VLM (SCRATCH), V16, V19 1479 1480 RET 1481 1482 // --------------------------------------- 1483 // p256SqrInternalVMSL 1484 // V0-V1,V30,V31 - Not Modified 1485 // V4-V14 - Volatile 1486 1487 TEXT ·p256SqrInternalVMSL(SB), NOFRAME|NOSPLIT, $0-0 1488 VSTM V16, V18, (SCRATCH) 1489 1490 MOVD $p256vmsl<>+0x00(SB), CPOOL 1491 // Divide input into limbs 1492 VGBM $0x007f, V14 1493 VZERO V12 1494 VSLDB $2, X1, X0, V13 1495 VSLDB $4, V12, X1, V11 // V11(X1): 4 bytes limb 1496 1497 VN V14, X0, V10 // V10: first 7 bytes limb 1498 VN V14, V13, V13 // v13: third 7 bytes limb 1499 1500 VMSLG V10, V10, V12, V10 // v10: l10 x l5 (column 1) 1501 VMSLG V13, V13, V12, V13 // v13: l8 x l3 1502 VMSLG V11, V11, V12, V11 // v11: l6 x l1 (column 9) 1503 1504 MOVD $p256vmsl<>+0x00(SB), CPOOL 1505 VGBM $0x7f7f, V14 1506 1507 VL 0(CPOOL), V4 1508 VL 16(CPOOL), V7 1509 VL 32(CPOOL), V9 1510 VL 48(CPOOL), V5 1511 VLM 64(CPOOL), V16, V18 1512 VL 112(CPOOL), V8 1513 1514 VPERM V12, X0, V4, V4 // v4: limb4 | limb5 1515 VPERM X1, X0, V7, V7 1516 VPERM V12, X0, V9, V9 // v9: limb10 | limb9 1517 VPERM X1, X0, V5, V5 1518 VPERM X1, X0, V16, V16 1519 VPERM X1, X0, V17, V17 1520 VPERM X1, V12, V18, V18 // v18: limb1 | limb2 1521 VPERM X1, V12, V8, V8 // v8: limb7 | limb6 1522 VN V14, V7, V7 // v7: limb9 | limb8 1523 VN V14, V5, V5 // v5: limb3 | limb4 1524 VN V14, V16, V16 // v16: limb2 | limb3 1525 VN V14, V17, V17 // v17: limb8 | limb7 1526 1527 VMSLEOG V9, V18, V13, V6 // v6: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4 + l6 x l5 (column 5) 1528 VMSLG V9, V4, V12, V14 // v14: l10 x l4 + l9 x l5 (column 2) 1529 VMSLEOG V9, V16, V12, V16 // v16: l10 x l2 + l9 x l3 + l8 x l4 + l7 x l5 (column 4) 1530 VMSLEOG V7, V18, V12, V7 // v7: l9 x l1 + l8 x l2 (column 6) 1531 VMSLEG V17, V18, V12, V13 // v13: l8 x l1 + l7 x l2 + l6 x l3 (column 7) 1532 VMSLG V8, V18, V12, V8 // v8: l7 x l1 + l6 x l2 (column 8) 1533 VMSLEG V9, V5, V12, V18 // v18: l10 x l3 + l9 x l4 + l8 x l5 (column 3) 1534 1535 VSLDB $9, V12, V10, V4 1536 VSLDB $9, V12, V7, V5 1537 VAQ V4, V14, V14 1538 VAQ V5, V13, V13 1539 1540 VSLDB $9, V12, V14, V4 1541 VSLDB $9, V12, V13, V5 1542 VAQ V4, V18, V18 1543 VAQ V5, V8, V8 1544 1545 VSLDB $9, V12, V18, V4 1546 VSLDB $9, V12, V8, V5 1547 VAQ V4, V16, V16 1548 VAQ V5, V11, V11 1549 1550 VSLDB $9, V12, V16, V4 1551 VAQ V4, V6, V17 1552 1553 VGBM $0x007f, V4 1554 VGBM $0x00ff, V5 1555 1556 VN V10, V4, V10 1557 VN V14, V4, V14 1558 VN V18, V4, V18 1559 VN V16, V4, V16 1560 VN V17, V4, V9 1561 VN V7, V4, V7 1562 VN V13, V4, V13 1563 VN V8, V4, V8 1564 VN V11, V5, V11 1565 1566 VSLDB $7, V14, V14, V14 1567 VSLDB $14, V18, V12, V4 1568 VSLDB $14, V12, V18, V18 1569 VSLDB $5, V16, V16, V16 1570 VSLDB $12, V9, V12, V5 1571 1572 VO V14, V10, V10 1573 VO V18, V16, V16 1574 VO V4, V10, V10 // first rightmost 128bits of the multiplication result 1575 VO V5, V16, V16 // second rightmost 128bits of the multiplication result 1576 1577 // adjust v7, v13, v8, v11 1578 VSLDB $7, V13, V13, V13 1579 VSLDB $14, V8, V12, V4 1580 VSLDB $14, V12, V8, V8 1581 VSLDB $5, V11, V12, V5 1582 VO V13, V7, V7 1583 VO V4, V7, V7 1584 VO V8, V5, V11 1585 1586 VSLDB $9, V12, V17, V14 1587 VSLDB $12, V12, V9, V9 1588 VACCQ V7, V14, V13 1589 VAQ V7, V14, V7 1590 VAQ V11, V13, V11 1591 1592 // First reduction, 96 bits 1593 VSLDB $4, V16, V10, T0 1594 VSLDB $4, V12, V16, T1 1595 VSLDB $3, V11, V7, V11 // fourth rightmost 128bits of the multiplication result 1596 VSLDB $3, V7, V12, V7 1597 OBSERVATION3(V10, V8, T2, V16, V17, V18)// results V8 | T2 1598 VO V7, V9, V7 // third rightmost 128bits of the multiplication result 1599 VACCQ T0, T2, V9 1600 VAQ T0, T2, T2 1601 VACQ T1, V8, V9, V8 1602 1603 // Second reduction 96 bits 1604 VSLDB $4, V8, T2, T0 1605 VSLDB $4, V12, V8, T1 1606 OBSERVATION3(T2, V9, V8, V16, V17, V18)// results V9 | V8 1607 VACCQ T0, V8, T2 1608 VAQ T0, V8, V8 1609 VACQ T1, V9, T2, V9 1610 1611 // Third reduction 64 bits 1612 VSLDB $8, V9, V8, T0 1613 VSLDB $8, V12, V9, T1 1614 OBSERVATION3A(V8, V14, V13, V17, V18)// results V14 | V13 1615 VACCQ T0, V13, V12 1616 VAQ T0, V13, V13 1617 VACQ T1, V14, V12, V14 1618 VACCQ V13, V7, V12 1619 VAQ V13, V7, T0 1620 VACCCQ V14, V11, V12, T2 1621 VACQ V14, V11, V12, T1 // results T2 | T1 | T0 1622 1623 // --------------------------------------------------- 1624 MOVD $p256mul<>+0x00(SB), CPOOL 1625 1626 VZERO V12 1627 VSCBIQ P0, T0, V8 1628 VSQ P0, T0, V7 1629 VSBCBIQ T1, P1, V8, V10 1630 VSBIQ T1, P1, V8, V9 1631 VSBIQ T2, V12, V10, T2 1632 1633 // what output to use, V9||V7 or T1||T0? 1634 VSEL T0, V7, T2, T0 1635 VSEL T1, V9, T2, T1 1636 1637 VLM (SCRATCH), V16, V18 1638 RET 1639 1640 1641 1642 #undef CPOOL 1643 #undef SCRATCH 1644 #undef X0 1645 #undef X1 1646 #undef Y0 1647 #undef Y1 1648 #undef T0 1649 #undef T1 1650 #undef T2 1651 #undef P0 1652 #undef P1 1653 1654 #define SCRATCH R9 1655 1656 TEXT p256MulInternal<>(SB),NOSPLIT,$64-0 1657 MOVD $scratch-64(SP), SCRATCH 1658 MOVD ·p256MulInternalFacility+0x00(SB),R7 1659 CALL (R7) 1660 RET 1661 1662 TEXT ·p256MulInternalTrampolineSetup(SB),NOSPLIT|NOFRAME, $0 1663 MOVBZ internal∕cpu·S390X+const_offsetS390xHasVE1(SB), R0 1664 MOVD $·p256MulInternalFacility+0x00(SB), R7 1665 MOVD $·p256MulInternalVX(SB), R8 1666 CMPBEQ R0, $0, novmsl // VE1 facility = 1, VMSL supported 1667 MOVD $·p256MulInternalVMSL(SB), R8 1668 novmsl: 1669 MOVD R8, 0(R7) 1670 BR (R8) 1671 1672 GLOBL ·p256MulInternalFacility+0x00(SB), NOPTR, $8 1673 DATA ·p256MulInternalFacility+0x00(SB)/8, $·p256MulInternalTrampolineSetup(SB) 1674 1675 // Parameters 1676 #define X0 V0 1677 #define X1 V1 1678 #define Y0 V2 1679 #define Y1 V3 1680 1681 TEXT ·p256SqrInternalVX(SB), NOFRAME|NOSPLIT, $0 1682 VLR X0, Y0 1683 VLR X1, Y1 1684 BR ·p256MulInternalVX(SB) 1685 1686 #undef X0 1687 #undef X1 1688 #undef Y0 1689 #undef Y1 1690 1691 1692 TEXT p256SqrInternal<>(SB),NOSPLIT,$48-0 1693 MOVD $scratch-48(SP), SCRATCH 1694 MOVD ·p256SqrInternalFacility+0x00(SB),R7 1695 CALL (R7) 1696 RET 1697 1698 TEXT ·p256SqrInternalTrampolineSetup(SB),NOSPLIT|NOFRAME, $0 1699 MOVBZ internal∕cpu·S390X+const_offsetS390xHasVE1(SB), R0 1700 MOVD $·p256SqrInternalFacility+0x00(SB), R7 1701 MOVD $·p256SqrInternalVX(SB), R8 1702 CMPBEQ R0, $0, novmsl // VE1 facility = 1, VMSL supported 1703 MOVD $·p256SqrInternalVMSL(SB), R8 1704 novmsl: 1705 MOVD R8, 0(R7) 1706 BR (R8) 1707 1708 1709 GLOBL ·p256SqrInternalFacility+0x00(SB), NOPTR, $8 1710 DATA ·p256SqrInternalFacility+0x00(SB)/8, $·p256SqrInternalTrampolineSetup(SB) 1711 1712 #undef SCRATCH 1713 1714 1715 #define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \ 1716 VZERO ZER \ 1717 VSCBIQ Y0, X0, CAR1 \ 1718 VSQ Y0, X0, T0 \ 1719 VSBCBIQ X1, Y1, CAR1, SEL1 \ 1720 VSBIQ X1, Y1, CAR1, T1 \ 1721 VSQ SEL1, ZER, SEL1 \ 1722 \ 1723 VACCQ T0, PL, CAR1 \ 1724 VAQ T0, PL, TT0 \ 1725 VACQ T1, PH, CAR1, TT1 \ 1726 \ 1727 VSEL T0, TT0, SEL1, T0 \ 1728 VSEL T1, TT1, SEL1, T1 \ 1729 1730 #define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \ 1731 VACCQ X0, Y0, CAR1 \ 1732 VAQ X0, Y0, T0 \ 1733 VACCCQ X1, Y1, CAR1, T2 \ 1734 VACQ X1, Y1, CAR1, T1 \ 1735 \ 1736 VZERO ZER \ 1737 VSCBIQ PL, T0, CAR1 \ 1738 VSQ PL, T0, TT0 \ 1739 VSBCBIQ T1, PH, CAR1, CAR2 \ 1740 VSBIQ T1, PH, CAR1, TT1 \ 1741 VSBIQ T2, ZER, CAR2, SEL1 \ 1742 \ 1743 VSEL T0, TT0, SEL1, T0 \ 1744 VSEL T1, TT1, SEL1, T1 1745 1746 #define p256HalfInternal(T1, T0, X1, X0) \ 1747 VZERO ZER \ 1748 VSBIQ ZER, ZER, X0, SEL1 \ 1749 \ 1750 VACCQ X0, PL, CAR1 \ 1751 VAQ X0, PL, T0 \ 1752 VACCCQ X1, PH, CAR1, T2 \ 1753 VACQ X1, PH, CAR1, T1 \ 1754 \ 1755 VSEL X0, T0, SEL1, T0 \ 1756 VSEL X1, T1, SEL1, T1 \ 1757 VSEL ZER, T2, SEL1, T2 \ 1758 \ 1759 VSLDB $15, T2, ZER, TT1 \ 1760 VSLDB $15, T1, ZER, TT0 \ 1761 VREPIB $1, SEL1 \ 1762 VSRL SEL1, T0, T0 \ 1763 VSRL SEL1, T1, T1 \ 1764 VREPIB $7, SEL1 \ 1765 VSL SEL1, TT0, TT0 \ 1766 VSL SEL1, TT1, TT1 \ 1767 VO T0, TT0, T0 \ 1768 VO T1, TT1, T1 1769 1770 // --------------------------------------- 1771 // func p256MulAsm(res, in1, in2 []byte) 1772 #define res_ptr R1 1773 #define x_ptr R2 1774 #define y_ptr R3 1775 #define CPOOL R4 1776 1777 // Parameters 1778 #define X0 V0 1779 #define X1 V1 1780 #define Y0 V2 1781 #define Y1 V3 1782 #define T0 V4 1783 #define T1 V5 1784 1785 // Constants 1786 #define P0 V30 1787 #define P1 V31 1788 TEXT ·p256MulAsm(SB), NOSPLIT, $0 1789 MOVD res+0(FP), res_ptr 1790 MOVD in1+24(FP), x_ptr 1791 MOVD in2+48(FP), y_ptr 1792 1793 VL (1*16)(x_ptr), X0 1794 VL (0*16)(x_ptr), X1 1795 VL (1*16)(y_ptr), Y0 1796 VL (0*16)(y_ptr), Y1 1797 1798 MOVD $p256mul<>+0x00(SB), CPOOL 1799 VL 16(CPOOL), P0 1800 VL 0(CPOOL), P1 1801 1802 CALL p256MulInternal<>(SB) 1803 1804 VST T0, (1*16)(res_ptr) 1805 VST T1, (0*16)(res_ptr) 1806 RET 1807 1808 #undef res_ptr 1809 #undef x_ptr 1810 #undef y_ptr 1811 #undef CPOOL 1812 1813 #undef X0 1814 #undef X1 1815 #undef Y0 1816 #undef Y1 1817 #undef T0 1818 #undef T1 1819 #undef P0 1820 #undef P1 1821 1822 // --------------------------------------- 1823 // func p256SqrAsm(res, in1 []byte) 1824 #define res_ptr R1 1825 #define x_ptr R2 1826 #define y_ptr R3 1827 #define CPOOL R4 1828 1829 // Parameters 1830 #define X0 V0 1831 #define X1 V1 1832 #define T0 V4 1833 #define T1 V5 1834 1835 // Constants 1836 #define P0 V30 1837 #define P1 V31 1838 TEXT ·p256SqrAsm(SB), NOSPLIT, $0 1839 MOVD res+0(FP), res_ptr 1840 MOVD in1+24(FP), x_ptr 1841 1842 VL (1*16)(x_ptr), X0 1843 VL (0*16)(x_ptr), X1 1844 1845 MOVD $p256mul<>+0x00(SB), CPOOL 1846 VL 16(CPOOL), P0 1847 VL 0(CPOOL), P1 1848 1849 CALL p256SqrInternal<>(SB) 1850 1851 VST T0, (1*16)(res_ptr) 1852 VST T1, (0*16)(res_ptr) 1853 RET 1854 1855 #undef res_ptr 1856 #undef x_ptr 1857 #undef y_ptr 1858 #undef CPOOL 1859 1860 #undef X0 1861 #undef X1 1862 #undef T0 1863 #undef T1 1864 #undef P0 1865 #undef P1 1866 1867 1868 // Point add with P2 being affine point 1869 // If sign == 1 -> P2 = -P2 1870 // If sel == 0 -> P3 = P1 1871 // if zero == 0 -> P3 = P2 1872 // p256PointAddAffineAsm(P3, P1, P2 *p256Point, sign, sel, zero int) 1873 #define P3ptr R1 1874 #define P1ptr R2 1875 #define P2ptr R3 1876 #define CPOOL R4 1877 1878 // Temporaries in REGs 1879 #define Y2L V15 1880 #define Y2H V16 1881 #define T1L V17 1882 #define T1H V18 1883 #define T2L V19 1884 #define T2H V20 1885 #define T3L V21 1886 #define T3H V22 1887 #define T4L V23 1888 #define T4H V24 1889 1890 // Temps for Sub and Add 1891 #define TT0 V11 1892 #define TT1 V12 1893 #define T2 V13 1894 1895 // p256MulAsm Parameters 1896 #define X0 V0 1897 #define X1 V1 1898 #define Y0 V2 1899 #define Y1 V3 1900 #define T0 V4 1901 #define T1 V5 1902 1903 #define PL V30 1904 #define PH V31 1905 1906 // Names for zero/sel selects 1907 #define X1L V0 1908 #define X1H V1 1909 #define Y1L V2 // p256MulAsmParmY 1910 #define Y1H V3 // p256MulAsmParmY 1911 #define Z1L V4 1912 #define Z1H V5 1913 #define X2L V0 1914 #define X2H V1 1915 #define Z2L V4 1916 #define Z2H V5 1917 #define X3L V17 // T1L 1918 #define X3H V18 // T1H 1919 #define Y3L V21 // T3L 1920 #define Y3H V22 // T3H 1921 #define Z3L V28 1922 #define Z3H V29 1923 1924 #define ZER V6 1925 #define SEL1 V7 1926 #define CAR1 V8 1927 #define CAR2 V9 1928 /* * 1929 * Three operand formula: 1930 * Source: 2004 Hankerson–Menezes–Vanstone, page 91. 1931 * T1 = Z1² 1932 * T2 = T1*Z1 1933 * T1 = T1*X2 1934 * T2 = T2*Y2 1935 * T1 = T1-X1 1936 * T2 = T2-Y1 1937 * Z3 = Z1*T1 1938 * T3 = T1² 1939 * T4 = T3*T1 1940 * T3 = T3*X1 1941 * T1 = 2*T3 1942 * X3 = T2² 1943 * X3 = X3-T1 1944 * X3 = X3-T4 1945 * T3 = T3-X3 1946 * T3 = T3*T2 1947 * T4 = T4*Y1 1948 * Y3 = T3-T4 1949 1950 * Three operand formulas, but with MulInternal X,Y used to store temps 1951 X=Z1; Y=Z1; MUL;T- // T1 = Z1² T1 1952 X=T ; Y- ; MUL;T2=T // T2 = T1*Z1 T1 T2 1953 X- ; Y=X2; MUL;T1=T // T1 = T1*X2 T1 T2 1954 X=T2; Y=Y2; MUL;T- // T2 = T2*Y2 T1 T2 1955 SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2 1956 SUB(Y<T1-X1) // T1 = T1-X1 T1 T2 1957 X=Z1; Y- ; MUL;Z3:=T// Z3 = Z1*T1 T2 1958 X=Y; Y- ; MUL;X=T // T3 = T1*T1 T2 1959 X- ; Y- ; MUL;T4=T // T4 = T3*T1 T2 T4 1960 X- ; Y=X1; MUL;T3=T // T3 = T3*X1 T2 T3 T4 1961 ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4 1962 X=T2; Y=T2; MUL;T- // X3 = T2*T2 T1 T2 T3 T4 1963 SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4 1964 SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4 1965 SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4 1966 X- ; Y- ; MUL;T3=T // T3 = T3*T2 T2 T3 T4 1967 X=T4; Y=Y1; MUL;T- // T4 = T4*Y1 T3 T4 1968 SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4 1969 1970 */ 1971 TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0 1972 MOVD P3+0(FP), P3ptr 1973 MOVD P1+8(FP), P1ptr 1974 MOVD P2+16(FP), P2ptr 1975 1976 MOVD $p256mul<>+0x00(SB), CPOOL 1977 VL 16(CPOOL), PL 1978 VL 0(CPOOL), PH 1979 1980 // if (sign == 1) { 1981 // Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2 = P-Y2 1982 // } 1983 1984 VL 32(P2ptr), Y2H 1985 VL 48(P2ptr), Y2L 1986 1987 VLREPG sign+24(FP), SEL1 1988 VZERO ZER 1989 VCEQG SEL1, ZER, SEL1 1990 1991 VSCBIQ Y2L, PL, CAR1 1992 VSQ Y2L, PL, T1L 1993 VSBIQ PH, Y2H, CAR1, T1H 1994 1995 VSEL Y2L, T1L, SEL1, Y2L 1996 VSEL Y2H, T1H, SEL1, Y2H 1997 1998 /* * 1999 * Three operand formula: 2000 * Source: 2004 Hankerson–Menezes–Vanstone, page 91. 2001 */ 2002 // X=Z1; Y=Z1; MUL; T- // T1 = Z1² T1 2003 VL 64(P1ptr), X1 // Z1H 2004 VL 80(P1ptr), X0 // Z1L 2005 VLR X0, Y0 2006 VLR X1, Y1 2007 CALL p256SqrInternal<>(SB) 2008 2009 // X=T ; Y- ; MUL; T2=T // T2 = T1*Z1 T1 T2 2010 VLR T0, X0 2011 VLR T1, X1 2012 CALL p256MulInternal<>(SB) 2013 VLR T0, T2L 2014 VLR T1, T2H 2015 2016 // X- ; Y=X2; MUL; T1=T // T1 = T1*X2 T1 T2 2017 VL 0(P2ptr), Y1 // X2H 2018 VL 16(P2ptr), Y0 // X2L 2019 CALL p256MulInternal<>(SB) 2020 VLR T0, T1L 2021 VLR T1, T1H 2022 2023 // X=T2; Y=Y2; MUL; T- // T2 = T2*Y2 T1 T2 2024 VLR T2L, X0 2025 VLR T2H, X1 2026 VLR Y2L, Y0 2027 VLR Y2H, Y1 2028 CALL p256MulInternal<>(SB) 2029 2030 // SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2 2031 VL 32(P1ptr), Y1H 2032 VL 48(P1ptr), Y1L 2033 p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L) 2034 2035 // SUB(Y<T1-X1) // T1 = T1-X1 T1 T2 2036 VL 0(P1ptr), X1H 2037 VL 16(P1ptr), X1L 2038 p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L) 2039 2040 // X=Z1; Y- ; MUL; Z3:=T// Z3 = Z1*T1 T2 2041 VL 64(P1ptr), X1 // Z1H 2042 VL 80(P1ptr), X0 // Z1L 2043 CALL p256MulInternal<>(SB) 2044 2045 // VST T1, 64(P3ptr) 2046 // VST T0, 80(P3ptr) 2047 VLR T0, Z3L 2048 VLR T1, Z3H 2049 2050 // X=Y; Y- ; MUL; X=T // T3 = T1*T1 T2 2051 VLR Y0, X0 2052 VLR Y1, X1 2053 CALL p256SqrInternal<>(SB) 2054 VLR T0, X0 2055 VLR T1, X1 2056 2057 // X- ; Y- ; MUL; T4=T // T4 = T3*T1 T2 T4 2058 CALL p256MulInternal<>(SB) 2059 VLR T0, T4L 2060 VLR T1, T4H 2061 2062 // X- ; Y=X1; MUL; T3=T // T3 = T3*X1 T2 T3 T4 2063 VL 0(P1ptr), Y1 // X1H 2064 VL 16(P1ptr), Y0 // X1L 2065 CALL p256MulInternal<>(SB) 2066 VLR T0, T3L 2067 VLR T1, T3H 2068 2069 // ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4 2070 p256AddInternal(T1H,T1L, T1,T0,T1,T0) 2071 2072 // X=T2; Y=T2; MUL; T- // X3 = T2*T2 T1 T2 T3 T4 2073 VLR T2L, X0 2074 VLR T2H, X1 2075 VLR T2L, Y0 2076 VLR T2H, Y1 2077 CALL p256SqrInternal<>(SB) 2078 2079 // SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4 (T1 = X3) 2080 p256SubInternal(T1,T0,T1,T0,T1H,T1L) 2081 2082 // SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4 2083 p256SubInternal(T1,T0,T1,T0,T4H,T4L) 2084 VLR T0, X3L 2085 VLR T1, X3H 2086 2087 // SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4 2088 p256SubInternal(X1,X0,T3H,T3L,T1,T0) 2089 2090 // X- ; Y- ; MUL; T3=T // T3 = T3*T2 T2 T3 T4 2091 CALL p256MulInternal<>(SB) 2092 VLR T0, T3L 2093 VLR T1, T3H 2094 2095 // X=T4; Y=Y1; MUL; T- // T4 = T4*Y1 T3 T4 2096 VLR T4L, X0 2097 VLR T4H, X1 2098 VL 32(P1ptr), Y1 // Y1H 2099 VL 48(P1ptr), Y0 // Y1L 2100 CALL p256MulInternal<>(SB) 2101 2102 // SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4 (T3 = Y3) 2103 p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0) 2104 2105 // if (sel == 0) { 2106 // copy(P3.x[:], X1) 2107 // copy(P3.y[:], Y1) 2108 // copy(P3.z[:], Z1) 2109 // } 2110 2111 VL 0(P1ptr), X1H 2112 VL 16(P1ptr), X1L 2113 2114 // Y1 already loaded, left over from addition 2115 VL 64(P1ptr), Z1H 2116 VL 80(P1ptr), Z1L 2117 2118 VLREPG sel+32(FP), SEL1 2119 VZERO ZER 2120 VCEQG SEL1, ZER, SEL1 2121 2122 VSEL X1L, X3L, SEL1, X3L 2123 VSEL X1H, X3H, SEL1, X3H 2124 VSEL Y1L, Y3L, SEL1, Y3L 2125 VSEL Y1H, Y3H, SEL1, Y3H 2126 VSEL Z1L, Z3L, SEL1, Z3L 2127 VSEL Z1H, Z3H, SEL1, Z3H 2128 2129 // if (zero == 0) { 2130 // copy(P3.x[:], X2) 2131 // copy(P3.y[:], Y2) 2132 // copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 2133 // 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}) //(p256.z*2^256)%p 2134 // } 2135 VL 0(P2ptr), X2H 2136 VL 16(P2ptr), X2L 2137 2138 // Y2 already loaded 2139 VL 128(CPOOL), Z2H 2140 VL 144(CPOOL), Z2L 2141 2142 VLREPG zero+40(FP), SEL1 2143 VZERO ZER 2144 VCEQG SEL1, ZER, SEL1 2145 2146 VSEL X2L, X3L, SEL1, X3L 2147 VSEL X2H, X3H, SEL1, X3H 2148 VSEL Y2L, Y3L, SEL1, Y3L 2149 VSEL Y2H, Y3H, SEL1, Y3H 2150 VSEL Z2L, Z3L, SEL1, Z3L 2151 VSEL Z2H, Z3H, SEL1, Z3H 2152 2153 // All done, store out the result!!! 2154 VST X3H, 0(P3ptr) 2155 VST X3L, 16(P3ptr) 2156 VST Y3H, 32(P3ptr) 2157 VST Y3L, 48(P3ptr) 2158 VST Z3H, 64(P3ptr) 2159 VST Z3L, 80(P3ptr) 2160 2161 RET 2162 2163 #undef P3ptr 2164 #undef P1ptr 2165 #undef P2ptr 2166 #undef CPOOL 2167 2168 #undef Y2L 2169 #undef Y2H 2170 #undef T1L 2171 #undef T1H 2172 #undef T2L 2173 #undef T2H 2174 #undef T3L 2175 #undef T3H 2176 #undef T4L 2177 #undef T4H 2178 2179 #undef TT0 2180 #undef TT1 2181 #undef T2 2182 2183 #undef X0 2184 #undef X1 2185 #undef Y0 2186 #undef Y1 2187 #undef T0 2188 #undef T1 2189 2190 #undef PL 2191 #undef PH 2192 2193 #undef X1L 2194 #undef X1H 2195 #undef Y1L 2196 #undef Y1H 2197 #undef Z1L 2198 #undef Z1H 2199 #undef X2L 2200 #undef X2H 2201 #undef Z2L 2202 #undef Z2H 2203 #undef X3L 2204 #undef X3H 2205 #undef Y3L 2206 #undef Y3H 2207 #undef Z3L 2208 #undef Z3H 2209 2210 #undef ZER 2211 #undef SEL1 2212 #undef CAR1 2213 #undef CAR2 2214 2215 // p256PointDoubleAsm(P3, P1 *p256Point) 2216 // https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl 2217 // https://www.hyperelliptic.org/EFD/g1p/auto-shortw.html 2218 // https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html 2219 #define P3ptr R1 2220 #define P1ptr R2 2221 #define CPOOL R4 2222 2223 // Temporaries in REGs 2224 #define X3L V15 2225 #define X3H V16 2226 #define Y3L V17 2227 #define Y3H V18 2228 #define T1L V19 2229 #define T1H V20 2230 #define T2L V21 2231 #define T2H V22 2232 #define T3L V23 2233 #define T3H V24 2234 2235 #define X1L V6 2236 #define X1H V7 2237 #define Y1L V8 2238 #define Y1H V9 2239 #define Z1L V10 2240 #define Z1H V11 2241 2242 // Temps for Sub and Add 2243 #define TT0 V11 2244 #define TT1 V12 2245 #define T2 V13 2246 2247 // p256MulAsm Parameters 2248 #define X0 V0 2249 #define X1 V1 2250 #define Y0 V2 2251 #define Y1 V3 2252 #define T0 V4 2253 #define T1 V5 2254 2255 #define PL V30 2256 #define PH V31 2257 2258 #define Z3L V23 2259 #define Z3H V24 2260 2261 #define ZER V26 2262 #define SEL1 V27 2263 #define CAR1 V28 2264 #define CAR2 V29 2265 /* 2266 * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv 2267 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3. 2268 * Source: 2004 Hankerson–Menezes–Vanstone, page 91. 2269 * A = 3(X₁-Z₁²)×(X₁+Z₁²) 2270 * B = 2Y₁ 2271 * Z₃ = B×Z₁ 2272 * C = B² 2273 * D = C×X₁ 2274 * X₃ = A²-2D 2275 * Y₃ = (D-X₃)×A-C²/2 2276 * 2277 * Three-operand formula: 2278 * T1 = Z1² 2279 * T2 = X1-T1 2280 * T1 = X1+T1 2281 * T2 = T2*T1 2282 * T2 = 3*T2 2283 * Y3 = 2*Y1 2284 * Z3 = Y3*Z1 2285 * Y3 = Y3² 2286 * T3 = Y3*X1 2287 * Y3 = Y3² 2288 * Y3 = half*Y3 2289 * X3 = T2² 2290 * T1 = 2*T3 2291 * X3 = X3-T1 2292 * T1 = T3-X3 2293 * T1 = T1*T2 2294 * Y3 = T1-Y3 2295 */ 2296 2297 TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0 2298 MOVD P3+0(FP), P3ptr 2299 MOVD P1+8(FP), P1ptr 2300 2301 MOVD $p256mul<>+0x00(SB), CPOOL 2302 VL 16(CPOOL), PL 2303 VL 0(CPOOL), PH 2304 2305 // X=Z1; Y=Z1; MUL; T- // T1 = Z1² 2306 VL 64(P1ptr), X1 // Z1H 2307 VL 80(P1ptr), X0 // Z1L 2308 VLR X0, Y0 2309 VLR X1, Y1 2310 CALL p256SqrInternal<>(SB) 2311 2312 // SUB(X<X1-T) // T2 = X1-T1 2313 VL 0(P1ptr), X1H 2314 VL 16(P1ptr), X1L 2315 p256SubInternal(X1,X0,X1H,X1L,T1,T0) 2316 2317 // ADD(Y<X1+T) // T1 = X1+T1 2318 p256AddInternal(Y1,Y0,X1H,X1L,T1,T0) 2319 2320 // X- ; Y- ; MUL; T- // T2 = T2*T1 2321 CALL p256MulInternal<>(SB) 2322 2323 // ADD(T2<T+T); ADD(T2<T2+T) // T2 = 3*T2 2324 p256AddInternal(T2H,T2L,T1,T0,T1,T0) 2325 p256AddInternal(T2H,T2L,T2H,T2L,T1,T0) 2326 2327 // ADD(X<Y1+Y1) // Y3 = 2*Y1 2328 VL 32(P1ptr), Y1H 2329 VL 48(P1ptr), Y1L 2330 p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L) 2331 2332 // X- ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1 2333 VL 64(P1ptr), Y1 // Z1H 2334 VL 80(P1ptr), Y0 // Z1L 2335 CALL p256MulInternal<>(SB) 2336 VST T1, 64(P3ptr) 2337 VST T0, 80(P3ptr) 2338 2339 // X- ; Y=X ; MUL; T- // Y3 = Y3² 2340 VLR X0, Y0 2341 VLR X1, Y1 2342 CALL p256SqrInternal<>(SB) 2343 2344 // X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1 2345 VLR T0, X0 2346 VLR T1, X1 2347 VL 0(P1ptr), Y1 2348 VL 16(P1ptr), Y0 2349 CALL p256MulInternal<>(SB) 2350 VLR T0, T3L 2351 VLR T1, T3H 2352 2353 // X- ; Y=X ; MUL; T- // Y3 = Y3² 2354 VLR X0, Y0 2355 VLR X1, Y1 2356 CALL p256SqrInternal<>(SB) 2357 2358 // HAL(Y3<T) // Y3 = half*Y3 2359 p256HalfInternal(Y3H,Y3L, T1,T0) 2360 2361 // X=T2; Y=T2; MUL; T- // X3 = T2² 2362 VLR T2L, X0 2363 VLR T2H, X1 2364 VLR T2L, Y0 2365 VLR T2H, Y1 2366 CALL p256SqrInternal<>(SB) 2367 2368 // ADD(T1<T3+T3) // T1 = 2*T3 2369 p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L) 2370 2371 // SUB(X3<T-T1) X3:=X3 // X3 = X3-T1 2372 p256SubInternal(X3H,X3L,T1,T0,T1H,T1L) 2373 VST X3H, 0(P3ptr) 2374 VST X3L, 16(P3ptr) 2375 2376 // SUB(X<T3-X3) // T1 = T3-X3 2377 p256SubInternal(X1,X0,T3H,T3L,X3H,X3L) 2378 2379 // X- ; Y- ; MUL; T- // T1 = T1*T2 2380 CALL p256MulInternal<>(SB) 2381 2382 // SUB(Y3<T-Y3) // Y3 = T1-Y3 2383 p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L) 2384 2385 VST Y3H, 32(P3ptr) 2386 VST Y3L, 48(P3ptr) 2387 RET 2388 2389 #undef P3ptr 2390 #undef P1ptr 2391 #undef CPOOL 2392 #undef X3L 2393 #undef X3H 2394 #undef Y3L 2395 #undef Y3H 2396 #undef T1L 2397 #undef T1H 2398 #undef T2L 2399 #undef T2H 2400 #undef T3L 2401 #undef T3H 2402 #undef X1L 2403 #undef X1H 2404 #undef Y1L 2405 #undef Y1H 2406 #undef Z1L 2407 #undef Z1H 2408 #undef TT0 2409 #undef TT1 2410 #undef T2 2411 #undef X0 2412 #undef X1 2413 #undef Y0 2414 #undef Y1 2415 #undef T0 2416 #undef T1 2417 #undef PL 2418 #undef PH 2419 #undef Z3L 2420 #undef Z3H 2421 #undef ZER 2422 #undef SEL1 2423 #undef CAR1 2424 #undef CAR2 2425 2426 // p256PointAddAsm(P3, P1, P2 *p256Point) 2427 #define P3ptr R1 2428 #define P1ptr R2 2429 #define P2ptr R3 2430 #define CPOOL R4 2431 #define ISZERO R5 2432 #define TRUE R6 2433 2434 // Temporaries in REGs 2435 #define T1L V16 2436 #define T1H V17 2437 #define T2L V18 2438 #define T2H V19 2439 #define U1L V20 2440 #define U1H V21 2441 #define S1L V22 2442 #define S1H V23 2443 #define HL V24 2444 #define HH V25 2445 #define RL V26 2446 #define RH V27 2447 2448 // Temps for Sub and Add 2449 #define ZER V6 2450 #define SEL1 V7 2451 #define CAR1 V8 2452 #define CAR2 V9 2453 #define TT0 V11 2454 #define TT1 V12 2455 #define T2 V13 2456 2457 // p256MulAsm Parameters 2458 #define X0 V0 2459 #define X1 V1 2460 #define Y0 V2 2461 #define Y1 V3 2462 #define T0 V4 2463 #define T1 V5 2464 2465 #define PL V30 2466 #define PH V31 2467 /* 2468 * https://delta.cs.cinvestav.mx/~francisco/arith/julio.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields" 2469 * 2470 * A = X₁×Z₂² 2471 * B = Y₁×Z₂³ 2472 * C = X₂×Z₁²-A 2473 * D = Y₂×Z₁³-B 2474 * X₃ = D² - 2A×C² - C³ 2475 * Y₃ = D×(A×C² - X₃) - B×C³ 2476 * Z₃ = Z₁×Z₂×C 2477 * 2478 * Three-operand formula (adopted): https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2 2479 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R 2480 * 2481 * T1 = Z1*Z1 2482 * T2 = Z2*Z2 2483 * U1 = X1*T2 2484 * H = X2*T1 2485 * H = H-U1 2486 * Z3 = Z1*Z2 2487 * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array 2488 * 2489 * S1 = Z2*T2 2490 * S1 = Y1*S1 2491 * R = Z1*T1 2492 * R = Y2*R 2493 * R = R-S1 2494 * 2495 * T1 = H*H 2496 * T2 = H*T1 2497 * U1 = U1*T1 2498 * 2499 * X3 = R*R 2500 * X3 = X3-T2 2501 * T1 = 2*U1 2502 * X3 = X3-T1 << store-out X3 result reg 2503 * 2504 * T2 = S1*T2 2505 * Y3 = U1-X3 2506 * Y3 = R*Y3 2507 * Y3 = Y3-T2 << store-out Y3 result reg 2508 2509 // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1 2510 // X- ; Y=T ; MUL; R=T // R = Z1*T1 2511 // X=X2; Y- ; MUL; H=T // H = X2*T1 2512 // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2 2513 // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2 2514 // X=X1; Y- ; MUL; U1=T // U1 = X1*T2 2515 // SUB(H<H-T) // H = H-U1 2516 // X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2 2517 // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array 2518 // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1 2519 // X=Y2; Y=R ; MUL; T- // R = Y2*R 2520 // SUB(R<T-S1) // R = R-S1 2521 // X=H ; Y=H ; MUL; T- // T1 = H*H 2522 // X- ; Y=T ; MUL; T2=T // T2 = H*T1 2523 // X=U1; Y- ; MUL; U1=T // U1 = U1*T1 2524 // X=R ; Y=R ; MUL; T- // X3 = R*R 2525 // SUB(T<T-T2) // X3 = X3-T2 2526 // ADD(X<U1+U1) // T1 = 2*U1 2527 // SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg 2528 // SUB(Y<U1-T) // Y3 = U1-X3 2529 // X=R ; Y- ; MUL; U1=T // Y3 = R*Y3 2530 // X=S1; Y=T2; MUL; T- // T2 = S1*T2 2531 // SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg 2532 */ 2533 TEXT ·p256PointAddAsm(SB), NOSPLIT, $0 2534 MOVD P3+0(FP), P3ptr 2535 MOVD P1+8(FP), P1ptr 2536 MOVD P2+16(FP), P2ptr 2537 2538 MOVD $p256mul<>+0x00(SB), CPOOL 2539 VL 16(CPOOL), PL 2540 VL 0(CPOOL), PH 2541 2542 // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1 2543 VL 64(P1ptr), X1 // Z1H 2544 VL 80(P1ptr), X0 // Z1L 2545 VLR X0, Y0 2546 VLR X1, Y1 2547 CALL p256SqrInternal<>(SB) 2548 2549 // X- ; Y=T ; MUL; R=T // R = Z1*T1 2550 VLR T0, Y0 2551 VLR T1, Y1 2552 CALL p256MulInternal<>(SB) 2553 VLR T0, RL 2554 VLR T1, RH 2555 2556 // X=X2; Y- ; MUL; H=T // H = X2*T1 2557 VL 0(P2ptr), X1 // X2H 2558 VL 16(P2ptr), X0 // X2L 2559 CALL p256MulInternal<>(SB) 2560 VLR T0, HL 2561 VLR T1, HH 2562 2563 // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2 2564 VL 64(P2ptr), X1 // Z2H 2565 VL 80(P2ptr), X0 // Z2L 2566 VLR X0, Y0 2567 VLR X1, Y1 2568 CALL p256SqrInternal<>(SB) 2569 2570 // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2 2571 VLR T0, Y0 2572 VLR T1, Y1 2573 CALL p256MulInternal<>(SB) 2574 VLR T0, S1L 2575 VLR T1, S1H 2576 2577 // X=X1; Y- ; MUL; U1=T // U1 = X1*T2 2578 VL 0(P1ptr), X1 // X1H 2579 VL 16(P1ptr), X0 // X1L 2580 CALL p256MulInternal<>(SB) 2581 VLR T0, U1L 2582 VLR T1, U1H 2583 2584 // SUB(H<H-T) // H = H-U1 2585 p256SubInternal(HH,HL,HH,HL,T1,T0) 2586 2587 // if H == 0 or H^P == 0 then ret=1 else ret=0 2588 // clobbers T1H and T1L 2589 MOVD $0, ISZERO 2590 MOVD $1, TRUE 2591 VZERO ZER 2592 VO HL, HH, T1H 2593 VCEQGS ZER, T1H, T1H 2594 MOVDEQ TRUE, ISZERO 2595 VX HL, PL, T1L 2596 VX HH, PH, T1H 2597 VO T1L, T1H, T1H 2598 VCEQGS ZER, T1H, T1H 2599 MOVDEQ TRUE, ISZERO 2600 MOVD ISZERO, ret+24(FP) 2601 2602 // X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2 2603 VL 64(P1ptr), X1 // Z1H 2604 VL 80(P1ptr), X0 // Z1L 2605 VL 64(P2ptr), Y1 // Z2H 2606 VL 80(P2ptr), Y0 // Z2L 2607 CALL p256MulInternal<>(SB) 2608 2609 // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H 2610 VLR T0, X0 2611 VLR T1, X1 2612 VLR HL, Y0 2613 VLR HH, Y1 2614 CALL p256MulInternal<>(SB) 2615 VST T1, 64(P3ptr) 2616 VST T0, 80(P3ptr) 2617 2618 // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1 2619 VL 32(P1ptr), X1 2620 VL 48(P1ptr), X0 2621 VLR S1L, Y0 2622 VLR S1H, Y1 2623 CALL p256MulInternal<>(SB) 2624 VLR T0, S1L 2625 VLR T1, S1H 2626 2627 // X=Y2; Y=R ; MUL; T- // R = Y2*R 2628 VL 32(P2ptr), X1 2629 VL 48(P2ptr), X0 2630 VLR RL, Y0 2631 VLR RH, Y1 2632 CALL p256MulInternal<>(SB) 2633 2634 // SUB(R<T-S1) // R = T-S1 2635 p256SubInternal(RH,RL,T1,T0,S1H,S1L) 2636 2637 // if R == 0 or R^P == 0 then ret=ret else ret=0 2638 // clobbers T1H and T1L 2639 MOVD $0, ISZERO 2640 MOVD $1, TRUE 2641 VZERO ZER 2642 VO RL, RH, T1H 2643 VCEQGS ZER, T1H, T1H 2644 MOVDEQ TRUE, ISZERO 2645 VX RL, PL, T1L 2646 VX RH, PH, T1H 2647 VO T1L, T1H, T1H 2648 VCEQGS ZER, T1H, T1H 2649 MOVDEQ TRUE, ISZERO 2650 AND ret+24(FP), ISZERO 2651 MOVD ISZERO, ret+24(FP) 2652 2653 // X=H ; Y=H ; MUL; T- // T1 = H*H 2654 VLR HL, X0 2655 VLR HH, X1 2656 VLR HL, Y0 2657 VLR HH, Y1 2658 CALL p256SqrInternal<>(SB) 2659 2660 // X- ; Y=T ; MUL; T2=T // T2 = H*T1 2661 VLR T0, Y0 2662 VLR T1, Y1 2663 CALL p256MulInternal<>(SB) 2664 VLR T0, T2L 2665 VLR T1, T2H 2666 2667 // X=U1; Y- ; MUL; U1=T // U1 = U1*T1 2668 VLR U1L, X0 2669 VLR U1H, X1 2670 CALL p256MulInternal<>(SB) 2671 VLR T0, U1L 2672 VLR T1, U1H 2673 2674 // X=R ; Y=R ; MUL; T- // X3 = R*R 2675 VLR RL, X0 2676 VLR RH, X1 2677 VLR RL, Y0 2678 VLR RH, Y1 2679 CALL p256SqrInternal<>(SB) 2680 2681 // SUB(T<T-T2) // X3 = X3-T2 2682 p256SubInternal(T1,T0,T1,T0,T2H,T2L) 2683 2684 // ADD(X<U1+U1) // T1 = 2*U1 2685 p256AddInternal(X1,X0,U1H,U1L,U1H,U1L) 2686 2687 // SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg 2688 p256SubInternal(T1,T0,T1,T0,X1,X0) 2689 VST T1, 0(P3ptr) 2690 VST T0, 16(P3ptr) 2691 2692 // SUB(Y<U1-T) // Y3 = U1-X3 2693 p256SubInternal(Y1,Y0,U1H,U1L,T1,T0) 2694 2695 // X=R ; Y- ; MUL; U1=T // Y3 = R*Y3 2696 VLR RL, X0 2697 VLR RH, X1 2698 CALL p256MulInternal<>(SB) 2699 VLR T0, U1L 2700 VLR T1, U1H 2701 2702 // X=S1; Y=T2; MUL; T- // T2 = S1*T2 2703 VLR S1L, X0 2704 VLR S1H, X1 2705 VLR T2L, Y0 2706 VLR T2H, Y1 2707 CALL p256MulInternal<>(SB) 2708 2709 // SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg 2710 p256SubInternal(T1,T0,U1H,U1L,T1,T0) 2711 VST T1, 32(P3ptr) 2712 VST T0, 48(P3ptr) 2713 2714 RET