github.com/emmansun/gmsm@v0.29.1/internal/sm2ec/p256_asm_ppc64le.s

// This is a port of the NIST P256 ppc64le asm implementation to SM2 P256.
//
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"

// This is a port of the s390x asm implementation
// to ppc64le.

// Some changes were needed due to differences in
// the Go opcodes and/or available instructions
// between s390x and ppc64le.

// 1. There were operand order differences in the
// VSUBUQM, VSUBCUQ, and VSEL instructions.

// 2. ppc64 does not have multiply high and low
// instructions like s390x, so those were implemented
// using macros to compute the equivalent values.

// 3. The LVX, STVX instructions on ppc64 require
// 16 byte alignment of the data. To avoid that
// requirement, data is loaded using LXVD2X and
// STXVD2X with VPERM to reorder bytes correctly.

// I have identified some areas where I believe
// changes would be needed to make this work for big
// endian; however, additional changes beyond what I
// have noted are most likely needed to make it work:
// - The string used with VPERM to swap the byte order
//   for loads and stores.
// - The constants that are loaded from CPOOL.

// The following constants are defined in an order
// that is correct for use with LXVD2X/STXVD2X
// on little endian.
DATA p256ord<>+0x00(SB)/8, $0xfffffffeffffffff
DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x10(SB)/8, $0x7203df6b21c6052b
DATA p256ord<>+0x18(SB)/8, $0x53bbf40939d54123
DATA p256ord<>+0x20(SB)/8, $0x7235097572350975 // p256ord K0
DATA p256ord<>+0x28(SB)/8, $0x7235097572350975 // p256ord K0
DATA p256<>+0x00(SB)/8, $0xfffffffeffffffff // P256
DATA p256<>+0x08(SB)/8, $0xffffffffffffffff // P256
DATA p256<>+0x10(SB)/8, $0xffffffff00000000 // P256
DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256<>+0x20(SB)/8, $0x0000000000000000 // SEL 0 0 d1 d0
DATA p256<>+0x28(SB)/8, $0x18191a1b1c1d1e1f // SEL 0 0 d1 d0
DATA p256mul<>+0x00(SB)/8, $0xffffffff00000000 // P256 original
DATA p256mul<>+0x08(SB)/8, $0xffffffffffffffff // P256
DATA p256mul<>+0x10(SB)/8, $0xfffffffeffffffff // P256 original
DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0 0 0 d0
DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0 0 0 d0
DATA p256mul<>+0x30(SB)/8, $0x0405060708090a0b // SEL 0 0 d1 d0
DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL 0 0 d1 d0
DATA p256mul<>+0x40(SB)/8, $0x00000000ffffffff // (1*2^256)%P256
DATA p256mul<>+0x48(SB)/8, $0x0000000000000001 // (1*2^256)%P256
DATA p256mul<>+0x50(SB)/8, $0x0000000100000000 // (1*2^256)%P256
DATA p256mul<>+0x58(SB)/8, $0x0000000000000000 // (1*2^256)%P256

// External declarations for constants
GLOBL p256ord<>(SB), 8, $48
GLOBL p256<>(SB), 8, $48
GLOBL p256mul<>(SB), 8, $96

// The following macros implement the ppc64le equivalent
// of the corresponding s390x instructions for vector
// multiply high, low, and add, since there are no exact
// equivalent instructions. The corresponding s390x
// instructions appear in the comments.
// Implementation for big endian would have to be
// investigated; I think it would be different.
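// As a scalar illustration of what VMULT below produces per 32-bit lane
// (a sketch only, not part of the build):
//
//	func mulHiLo(x, y uint32) (hi, lo uint32) {
//		p := uint64(x) * uint64(y)
//		return uint32(p >> 32), uint32(p)
//	}
//
// VMULEUW/VMULOUW form the full 64-bit products of the even/odd word
// lanes, and VMRGEW/VMRGOW gather their high and low 32-bit halves back
// into separate result vectors.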
//
// Vector multiply word
//
// VMLF x0, x1, out_low
// VMLHF x0, x1, out_hi
#define VMULT(x1, x2, out_low, out_hi) \
	VMULEUW x1, x2, TMP1; \
	VMULOUW x1, x2, TMP2; \
	VMRGEW TMP1, TMP2, out_hi; \
	VMRGOW TMP1, TMP2, out_low

//
// Vector multiply add word
//
// VMALF x0, x1, y, out_low
// VMALHF x0, x1, y, out_hi
#define VMULT_ADD(x1, x2, y, one, out_low, out_hi) \
	VMULEUW y, one, TMP1; \
	VMULOUW y, one, TMP2; \
	VMULEUW x1, x2, out_hi; \
	VMULOUW x1, x2, out_low; \
	VADDUDM TMP1, out_hi, TMP1; \
	VADDUDM TMP2, out_low, TMP2; \
	VMRGEW TMP1, TMP2, out_hi; \
	VMRGOW TMP1, TMP2, out_low

#define res_ptr R3
#define a_ptr R4

#undef res_ptr
#undef a_ptr

#define P1ptr R3
#define CPOOL R7

#define Y1L V0
#define Y1H V1
#define T1L V2
#define T1H V3

#define PL V30
#define PH V31

#define SEL V4
#define ZER V5
#define CAR1 V6

// func p256NegCond(val *p256Element, cond int)
TEXT ·p256NegCond(SB), NOSPLIT, $0-16
	MOVD val+0(FP), P1ptr
	MOVD $16, R16
	MOVD $40, R17

	// cond is R1 + 8 (cond offset) + 32
	LXVDSX (R1)(R17), SEL
	VSPLTISB $0, ZER
	// SEL controls whether to store a or b
	VCMPEQUD SEL, ZER, SEL

	MOVD $p256mul<>+0x00(SB), CPOOL

	LXVD2X (P1ptr)(R0), Y1L
	LXVD2X (P1ptr)(R16), Y1H

	XXPERMDI Y1H, Y1H, $2, Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L

	LXVD2X (CPOOL)(R0), PL
	LXVD2X (CPOOL)(R16), PH

	VSUBCUQ PL, Y1L, CAR1       // subtract part2 giving carry
	VSUBUQM PL, Y1L, T1L        // subtract part2 giving result
	VSUBEUQM PH, Y1H, CAR1, T1H // subtract part1 using carry from part2

	VSEL T1H, Y1H, SEL, Y1H
	VSEL T1L, Y1L, SEL, Y1L

	XXPERMDI Y1H, Y1H, $2, Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L

	STXVD2X Y1L, (R0+P1ptr)
	STXVD2X Y1H, (R16+P1ptr)
	RET

#undef P1ptr
#undef CPOOL
#undef Y1L
#undef Y1H
#undef T1L
#undef T1H
#undef PL
#undef PH
#undef ZER
#undef SEL
#undef CAR1

#define P3ptr R3
#define P1ptr R4
#define P2ptr R5

#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define Z1L V4
#define Z1H V5
#define X2L V6
#define X2H V7
#define Y2L V8
#define Y2H V9
#define Z2L V10
#define Z2H V11
#define SEL V12
#define ZER V13
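// Both p256NegCond above and p256MovCond below rely on VSEL with an
// all-zeros/all-ones mask for branch-free selection. Per lane,
// VSEL va, vb, vc, vt computes vt = (va &^ vc) | (vb & vc); a scalar Go
// sketch of the same idea (illustrative only):
//
//	func sel(a, b, mask uint64) uint64 {
//		// mask is all-ones or all-zeros, produced here by VCMPEQUD
//		return (a &^ mask) | (b & mask)
//	}
//
// Because both inputs are always read and combined, the instruction
// sequence leaks nothing about cond.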
// This function uses LXVD2X and STXVD2X to avoid the
// data alignment requirement for LVX, STVX. Since
// this code is just moving bytes and not doing arithmetic,
// the order of the bytes doesn't matter.
//
// func p256MovCond(res, a, b *p256Point, cond int)
TEXT ·p256MovCond(SB), NOSPLIT, $0-32
	MOVD res+0(FP), P3ptr
	MOVD a+8(FP), P1ptr
	MOVD b+16(FP), P2ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $56, R21
	MOVD $64, R19
	MOVD $80, R20

	// cond is R1 + 24 (cond offset) + 32
	LXVDSX (R1)(R21), SEL
	VSPLTISB $0, ZER
	// SEL controls whether to store a or b
	VCMPEQUD SEL, ZER, SEL

	LXVD2X (P1ptr+R0), X1H
	LXVD2X (P1ptr+R16), X1L
	LXVD2X (P1ptr+R17), Y1H
	LXVD2X (P1ptr+R18), Y1L
	LXVD2X (P1ptr+R19), Z1H
	LXVD2X (P1ptr+R20), Z1L

	LXVD2X (P2ptr+R0), X2H
	LXVD2X (P2ptr+R16), X2L
	LXVD2X (P2ptr+R17), Y2H
	LXVD2X (P2ptr+R18), Y2L
	LXVD2X (P2ptr+R19), Z2H
	LXVD2X (P2ptr+R20), Z2L

	VSEL X1H, X2H, SEL, X1H
	VSEL X1L, X2L, SEL, X1L
	VSEL Y1H, Y2H, SEL, Y1H
	VSEL Y1L, Y2L, SEL, Y1L
	VSEL Z1H, Z2H, SEL, Z1H
	VSEL Z1L, Z2L, SEL, Z1L

	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	STXVD2X Z1H, (P3ptr+R19)
	STXVD2X Z1L, (P3ptr+R20)

	RET

#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef SEL
#undef ZER

#define P3ptr R3
#define P1ptr R4
#define COUNT R5

#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define Z1L V4
#define Z1H V5
#define X2L V6
#define X2H V7
#define Y2L V8
#define Y2H V9
#define Z2L V10
#define Z2H V11

#define ONE V18
#define IDX V19
#define SEL1 V20
#define SEL2 V21

// func p256Select(point *p256Point, table *p256Table, idx int, limit int)
TEXT ·p256Select(SB), NOSPLIT, $0-24
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	MOVD limit+24(FP), COUNT
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	LXVDSX (R1)(R18), SEL1 // VLREPG idx+32(FP), SEL1
	VSPLTB $7, SEL1, IDX   // splat byte
	VSPLTISB $1, ONE       // VREPIB $1, ONE
	VSPLTISB $1, SEL2      // VREPIB $1, SEL2
	MOVD COUNT, CTR        // set up ctr

	VSPLTISB $0, X1H // VZERO X1H
	VSPLTISB $0, X1L // VZERO X1L
	VSPLTISB $0, Y1H // VZERO Y1H
	VSPLTISB $0, Y1L // VZERO Y1L
	VSPLTISB $0, Z1H // VZERO Z1H
	VSPLTISB $0, Z1L // VZERO Z1L

loop_select:

	// LXVD2X is used here since data alignment doesn't
	// matter.

	LXVD2X (P1ptr+R0), X2H
	LXVD2X (P1ptr+R16), X2L
	LXVD2X (P1ptr+R17), Y2H
	LXVD2X (P1ptr+R18), Y2L
	LXVD2X (P1ptr+R19), Z2H
	LXVD2X (P1ptr+R20), Z2L

	VCMPEQUD SEL2, IDX, SEL1 // VCEQG SEL2, IDX, SEL1 OK

	// This will result in SEL1 being all 0s or 1s, meaning
	// the result is either X1L or X2L, no individual byte
	// selection.

	VSEL X1L, X2L, SEL1, X1L
	VSEL X1H, X2H, SEL1, X1H
	VSEL Y1L, Y2L, SEL1, Y1L
	VSEL Y1H, Y2H, SEL1, Y1H
	VSEL Z1L, Z2L, SEL1, Z1L
	VSEL Z1H, Z2H, SEL1, Z1H

	// Add 1 to all bytes in SEL2
	VADDUBM SEL2, ONE, SEL2 // VAB SEL2, ONE, SEL2 OK
	ADD $96, P1ptr
	BDNZ loop_select
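	// The loop above is a constant-time table scan: every entry is loaded
	// and conditionally merged, so the memory access pattern is independent
	// of idx. A Go sketch of the same idea (illustrative only; the entry
	// type is simplified to two limbs):
	//
	//	import "crypto/subtle"
	//
	//	func selectEntry(table [][2]uint64, idx int) (r [2]uint64) {
	//		for i, e := range table {
	//			// all-ones when i+1 == idx, all-zeros otherwise
	//			mask := -uint64(subtle.ConstantTimeEq(int32(i+1), int32(idx)))
	//			r[0] |= e[0] & mask
	//			r[1] |= e[1] & mask
	//		}
	//		return r
	//	}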
	// STXVD2X is used here so that alignment doesn't
	// need to be verified. Since values were loaded
	// using LXVD2X this is OK.
	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	STXVD2X Z1H, (P3ptr+R19)
	STXVD2X Z1L, (P3ptr+R20)
	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2

// The following functions all reverse the byte order.

// func p256BigToLittle(res *p256Element, in *[32]byte)
TEXT ·p256BigToLittle(SB), NOSPLIT, $0-16
	MOVD res+0(FP), R3
	MOVD in+8(FP), R4
	BR p256InternalEndianSwap<>(SB)

// func p256LittleToBig(res *[32]byte, in *p256Element)
TEXT ·p256LittleToBig(SB), NOSPLIT, $0-16
	MOVD res+0(FP), R3
	MOVD in+8(FP), R4
	BR p256InternalEndianSwap<>(SB)

// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0-16
	MOVD res+0(FP), R3
	MOVD in+8(FP), R4
	BR p256InternalEndianSwap<>(SB)

// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0-16
	MOVD res+0(FP), R3
	MOVD in+8(FP), R4
	BR p256InternalEndianSwap<>(SB)

TEXT p256InternalEndianSwap<>(SB), NOSPLIT, $0-0
	// Index registers needed for BR movs
	MOVD $8, R9
	MOVD $16, R10
	MOVD $24, R14

	MOVDBR (R0)(R4), R5
	MOVDBR (R9)(R4), R6
	MOVDBR (R10)(R4), R7
	MOVDBR (R14)(R4), R8

	MOVD R8, 0(R3)
	MOVD R7, 8(R3)
	MOVD R6, 16(R3)
	MOVD R5, 24(R3)

	RET

#define P3ptr R3
#define P1ptr R4
#define COUNT R5

#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define Z1L V4
#define Z1H V5
#define X2L V6
#define X2H V7
#define Y2L V8
#define Y2H V9
#define Z2L V10
#define Z2H V11

#define ONE V18
#define IDX V19
#define SEL1 V20
#define SEL2 V21

// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
TEXT ·p256SelectAffine(SB), NOSPLIT, $0-24
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18

	LXVDSX (R1)(R18), SEL1
	VSPLTB $7, SEL1, IDX // splat byte

	VSPLTISB $1, ONE  // Vector with byte 1s
	VSPLTISB $1, SEL2 // Vector with byte 1s
	MOVD $32, COUNT
	MOVD COUNT, CTR // loop count

	VSPLTISB $0, X1H // VZERO X1H
	VSPLTISB $0, X1L // VZERO X1L
	VSPLTISB $0, Y1H // VZERO Y1H
	VSPLTISB $0, Y1L // VZERO Y1L

loop_select:
	LXVD2X (P1ptr+R0), X2H
	LXVD2X (P1ptr+R16), X2L
	LXVD2X (P1ptr+R17), Y2H
	LXVD2X (P1ptr+R18), Y2L

	VCMPEQUD SEL2, IDX, SEL1 // Compare against idx

	VSEL X1L, X2L, SEL1, X1L // Select if idx matched
	VSEL X1H, X2H, SEL1, X1H
	VSEL Y1L, Y2L, SEL1, Y1L
	VSEL Y1H, Y2H, SEL1, Y1H

	VADDUBM SEL2, ONE, SEL2 // Increment SEL2 bytes by 1
	ADD $64, P1ptr          // Next chunk
	BDNZ loop_select

	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2
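// The routine below computes a word-serial Montgomery product over the SM2
// group order n: for each 32-bit digit yd of y, t += x*yd, then a multiple
// of n is added to cancel the low 32 bits, and t is shifted down one word.
// A minimal math/big sketch of the same algorithm (illustrative only; k0 is
// the constant stored at p256ord<>+0x20, i.e. -n^-1 mod 2^32):
//
//	import "math/big"
//
//	func ordMontMul(x, y, n *big.Int, k0 uint64) *big.Int {
//		mask := big.NewInt(0xffffffff)
//		t := new(big.Int)
//		for i := 0; i < 8; i++ {
//			yd := new(big.Int).Rsh(y, uint(32*i))
//			yd.And(yd, mask)
//			t.Add(t, yd.Mul(yd, x))
//			m := new(big.Int).And(t, mask)
//			m.Mul(m, new(big.Int).SetUint64(k0))
//			m.And(m, mask)
//			t.Add(t, m.Mul(m, n))
//			t.Rsh(t, 32) // exact: the low word is now zero
//		}
//		if t.Cmp(n) >= 0 {
//			t.Sub(t, n) // the asm does this with VSUBxQ + VSEL
//		}
//		return t // = x*y*2^-256 mod n
//	}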
// ---------------------------------------
// sm2p256OrdMulInternal
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define M1 V4
#define M0 V5
#define T0 V6
#define T1 V7
#define T2 V8
#define YDIG V9

#define ADD1 V16
#define ADD1H V17
#define ADD2 V18
#define ADD2H V19
#define RED1 V20
#define RED1H V21
#define RED2 V22
#define RED2H V23
#define CAR1 V24
#define CAR1M V25

#define MK0 V30
#define K0 V31

// TMP1, TMP2 used in
// VMULT macros
#define TMP1 V13
#define TMP2 V27
#define ONE V29 // 1s splatted by word

TEXT sm2p256OrdMulInternal<>(SB), NOSPLIT, $0
	// ---------------------------------------------------
	// VREPF $3, Y0, YDIG
	VSPLTW $3, Y0, YDIG
	VSPLTISW $1, ONE

	// VMLF X0, YDIG, ADD1
	// VMLF X1, YDIG, ADD2
	// VMLHF X0, YDIG, ADD1H
	// VMLHF X1, YDIG, ADD2H
	VMULT(X0, YDIG, ADD1, ADD1H)
	VMULT(X1, YDIG, ADD2, ADD2H)

	// VMLF ADD1, K0, MK0
	// VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	// VMALF M0, MK0, ADD1, RED1
	// VMALHF M0, MK0, ADD1, RED1H
	// VMALF M1, MK0, ADD2, RED2
	// VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSPLTISB $0, T2 // VZERO T2

	VSLDOI $12, RED2, RED1, RED1 // VSLDB
	VSLDOI $12, T2, RED2, RED2   // VSLDB

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM CAR1, T2, T2             // VAQ

	// ---------------------------------------------------
	/*
	 * ---+--------+--------+
	 *  T2|   T1   |   T0   |
	 * ---+--------+--------+
	 *          *(add)*
	 *    +--------+--------+
	 *    |   X1   |   X0   |
	 *    +--------+--------+
	 *          *(mul)*
	 *    +--------+--------+
	 *    |  YDIG  |  YDIG  |
	 *    +--------+--------+
	 *          *(add)*
	 *    +--------+--------+
	 *    |   M1   |   M0   |
	 *    +--------+--------+
	 *          *(mul)*
	 *    +--------+--------+
	 *    |  MK0   |  MK0   |
	 *    +--------+--------+
	 *
	 *   ---------------------
	 *
	 *    +--------+--------+
	 *    |  ADD2  |  ADD1  |
	 *    +--------+--------+
	 *    +--------+--------+
	 *    | ADD2H  | ADD1H  |
	 *    +--------+--------+
	 *    +--------+--------+
	 *    |  RED2  |  RED1  |
	 *    +--------+--------+
	 *    +--------+--------+
	 *    | RED2H  | RED1H  |
	 *    +--------+--------+
	 */
	// VREPF $2, Y0, YDIG
	VSPLTW $2, Y0, YDIG

	// VMALF X0, YDIG, T0, ADD1
	// VMALF X1, YDIG, T1, ADD2
	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	// VMLF ADD1, K0, MK0
	// VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	// VMALF M0, MK0, ADD1, RED1
	// VMALHF M0, MK0, ADD1, RED1H
	// VMALF M1, MK0, ADD2, RED2
	// VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1 // VSLDB
	VSLDOI $12, T2, RED2, RED2   // VSLDB

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0 // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM CAR1, T2, T2             // VAQ

	// ---------------------------------------------------
	// VREPF $1, Y0, YDIG
	VSPLTW $1, Y0, YDIG

	// VMALF X0, YDIG, T0, ADD1
	// VMALF X1, YDIG, T1, ADD2
	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	// VMLF ADD1, K0, MK0
	// VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	// VMALF M0, MK0, ADD1, RED1
	// VMALHF M0, MK0, ADD1, RED1H
	// VMALF M1, MK0, ADD2, RED2
	// VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1 // VSLDB
	VSLDOI $12, T2, RED2, RED2   // VSLDB

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM CAR1, T2, T2             // VAQ

	// ---------------------------------------------------
	// VREPF $0, Y0, YDIG
	VSPLTW $0, Y0, YDIG

	// VMALF X0, YDIG, T0, ADD1
	// VMALF X1, YDIG, T1, ADD2
	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	// VMLF ADD1, K0, MK0
	// VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	// VMALF M0, MK0, ADD1, RED1
	// VMALHF M0, MK0, ADD1, RED1H
	// VMALF M1, MK0, ADD2, RED2
	// VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1
	VSLDOI $12, T2, RED2, RED2

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM CAR1, T2, T2             // VAQ

	// ---------------------------------------------------
	// VREPF $3, Y1, YDIG
	VSPLTW $3, Y1, YDIG

	// VMALF X0, YDIG, T0, ADD1
	// VMALF X1, YDIG, T1, ADD2
	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	// VMLF ADD1, K0, MK0
	// VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	// VMALF M0, MK0, ADD1, RED1
	// VMALHF M0, MK0, ADD1, RED1H
	// VMALF M1, MK0, ADD2, RED2
	// VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1
	VSLDOI $12, T2, RED2, RED2

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1 // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM CAR1, T2, T2             // VAQ

	// ---------------------------------------------------
	// VREPF $2, Y1, YDIG
	VSPLTW $2, Y1, YDIG

	// VMALF X0, YDIG, T0, ADD1
	// VMALF X1, YDIG, T1, ADD2
	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	// VMLF ADD1, K0, MK0
	// VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	// VMALF M0, MK0, ADD1, RED1
	// VMALHF M0, MK0, ADD1, RED1H
	// VMALF M1, MK0, ADD2, RED2
	// VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1
	VSLDOI $12, T2, RED2, RED2

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM CAR1, T2, T2             // VAQ

	// ---------------------------------------------------
	// VREPF $1, Y1, YDIG
	VSPLTW $1, Y1, YDIG

	// VMALF X0, YDIG, T0, ADD1
	// VMALF X1, YDIG, T1, ADD2
	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	// VMLF ADD1, K0, MK0
	// VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	// VMALF M0, MK0, ADD1, RED1
	// VMALHF M0, MK0, ADD1, RED1H
	// VMALF M1, MK0, ADD2, RED2
	// VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1
	VSLDOI $12, T2, RED2, RED2

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM CAR1, T2, T2             // VAQ

	// ---------------------------------------------------
	// VREPF $0, Y1, YDIG
	VSPLTW $0, Y1, YDIG

	// VMALF X0, YDIG, T0, ADD1
	// VMALF X1, YDIG, T1, ADD2
	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	// VMLF ADD1, K0, MK0
	// VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	// VMALF M0, MK0, ADD1, RED1
	// VMALHF M0, MK0, ADD1, RED1H
	// VMALF M1, MK0, ADD2, RED2
	// VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1
	VSLDOI $12, T2, RED2, RED2

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM CAR1, T2, T2 // VAQ

	// ---------------------------------------------------

	// VZERO RED1
	// VSCBIQ M0, T0, CAR1
	// VSQ M0, T0, ADD1
	// VSBCBIQ T1, M1, CAR1, CAR1M
	// VSBIQ T1, M1, CAR1, ADD2
	// VSBIQ T2, RED1, CAR1M, T2
	VSPLTISB $0, RED1            // VZERO RED1
	VSUBCUQ T0, M0, CAR1         // VSCBIQ
	VSUBUQM T0, M0, ADD1         // VSQ
	VSUBECUQ T1, M1, CAR1, CAR1M // VSBCBIQ
	VSUBEUQM T1, M1, CAR1, ADD2  // VSBIQ
	VSUBEUQM T2, RED1, CAR1M, T2 // VSBIQ

	// what output to use, ADD2||ADD1 or T1||T0?
	VSEL ADD1, T0, T2, T0
	VSEL ADD2, T1, T2, T1
	RET

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef M0
#undef M1
#undef T0
#undef T1
#undef T2
#undef YDIG

#undef ADD1
#undef ADD1H
#undef ADD2
#undef ADD2H
#undef RED1
#undef RED1H
#undef RED2
#undef RED2H
#undef CAR1
#undef CAR1M

#undef MK0
#undef K0
#undef TMP1
#undef TMP2
#undef ONE

// ---------------------------------------

// func p256OrdMul(res, in1, in2 *p256OrdElement)
#define res_ptr R3
#define x_ptr R4
#define y_ptr R5
#define CPOOL R7
#define N R8

#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define M0 V5
#define M1 V4
#define T0 V6
#define T1 V7
#define K0 V31
TEXT ·p256OrdMul(SB), NOSPLIT, $0-24
	MOVD res+0(FP), res_ptr
	MOVD in1+8(FP), x_ptr
	MOVD in2+16(FP), y_ptr
	MOVD $16, R16
	MOVD $32, R17

	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

	LXVD2X (R0)(y_ptr), Y0
	LXVD2X (R16)(y_ptr), Y1

	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1

	MOVD $p256ord<>+0x00(SB), CPOOL
	LXVD2X (R16)(CPOOL), M0
	LXVD2X (R0)(CPOOL), M1
	LXVD2X (R17)(CPOOL), K0 // Can use VSPLTISW $0x72350975, K0 instead

	CALL sm2p256OrdMulInternal<>(SB)

	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X T0, (R0)(res_ptr)
	STXVD2X T1, (R16)(res_ptr)

	RET

// func p256OrdSqr(res, in *p256Element, n int)
TEXT ·p256OrdSqr(SB), NOSPLIT, $0-24
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr
	MOVD n+16(FP), N
	MOVD $16, R16
	MOVD $32, R17

	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

	MOVD $p256ord<>+0x00(SB), CPOOL
	LXVD2X (R16)(CPOOL), M0
	LXVD2X (R0)(CPOOL), M1
	LXVD2X (R17)(CPOOL), K0 // Can use VSPLTISW $0x72350975, K0 instead

sqrOrdLoop:
	// Sqr uses same value for both

	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL sm2p256OrdMulInternal<>(SB)

	ADD $-1, N
	CMP $0, N
	BEQ done

	VOR T0, T0, X0
	VOR T1, T1, X1
	BR sqrOrdLoop

done:
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X T0, (R0)(res_ptr)
	STXVD2X T1, (R16)(res_ptr)

	RET

#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef CPOOL
#undef N
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef M0
#undef M1
#undef T0
#undef T1
#undef K0
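// For reference, the p256OrdSqr loop above amounts to n chained Montgomery
// squarings; in Go terms (a sketch only, with p256OrdMul as declared above):
//
//	func ordSqr(res, in *p256OrdElement, n int) {
//		*res = *in
//		for i := 0; i < n; i++ {
//			p256OrdMul(res, res, res)
//		}
//	}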
#define res_ptr R3
#define x_ptr R4
#define CPOOL R7

#define T0 V0
#define T1 V1
#define T2 V2
#define TT0 V3
#define TT1 V4

#define ZER V6
#define SEL1 V7
#define SEL2 V8
#define CAR1 V9
#define CAR2 V10
#define RED1 V11
#define RED2 V12
#define PL V13
#define PH V14

// func p256FromMont(res, in *p256Element)
TEXT ·p256FromMont(SB), NOSPLIT, $0-16
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr

	MOVD $16, R16
	MOVD $32, R17
	MOVD $p256<>+0x00(SB), CPOOL

	VSPLTISB $0, T2  // VZERO T2
	VSPLTISB $0, ZER // VZERO ZER

	// Constants are defined so that the LXVD2X is correct
	LXVD2X (CPOOL+R0), PH
	LXVD2X (CPOOL+R16), PL

	// VPERM byte selections
	LXVD2X (CPOOL+R17), SEL1

	LXVD2X (R16)(x_ptr), T1
	LXVD2X (R0)(x_ptr), T0

	// Put in true little endian order
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
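	// Each of the four rounds below eliminates the low 64 bits of the
	// value: since p ≡ -1 (mod 2^64), the Montgomery digit is simply
	// m = t mod 2^64, and
	//
	//	t   = (t + m*p) / 2^64
	//	m*p = m*2^256 - m*2^224 - m*2^96 + m*2^64 - m
	//
	// which is why the reduction is built from VPERM/VSLDOI shifts and
	// subtractions of m instead of multiplications. After four rounds the
	// result is in*2^-256 mod p, i.e. the value is taken out of the
	// Montgomery domain, up to the final conditional subtraction.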
	// First round
	VPERM ZER, T0, SEL1, RED1 // 0 0 d1 d0
	VSLDOI $4, RED1, ZER, TT0 // 0 d1 d0 0
	VSLDOI $4, TT0, ZER, RED2 // d1 d0 0 0
	VSUBCUQ RED1, TT0, CAR1        // VSCBIQ TT0, RED1, CAR1
	VSUBUQM RED1, TT0, RED1        // VSQ TT0, RED1, RED1
	VSUBEUQM RED2, TT0, CAR1, RED2 // VSBIQ RED2, TT0, CAR1, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ T0, RED1, CAR1        // VACCQ T0, RED1, CAR1
	VADDUQM T0, RED1, T0          // VAQ T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ T1, RED2, CAR1, T1
	VADDUQM T2, CAR2, T2          // VAQ T2, CAR2, T2

	// Second round
	VPERM ZER, T0, SEL1, RED1 // 0 0 d1 d0
	VSLDOI $4, RED1, ZER, TT0 // 0 d1 d0 0
	VSLDOI $4, TT0, ZER, RED2 // d1 d0 0 0
	VSUBCUQ RED1, TT0, CAR1        // VSCBIQ TT0, RED1, CAR1
	VSUBUQM RED1, TT0, RED1        // VSQ TT0, RED1, RED1
	VSUBEUQM RED2, TT0, CAR1, RED2 // VSBIQ RED2, TT0, CAR1, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ T0, RED1, CAR1        // VACCQ T0, RED1, CAR1
	VADDUQM T0, RED1, T0          // VAQ T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ T1, RED2, CAR1, T1
	VADDUQM T2, CAR2, T2          // VAQ T2, CAR2, T2

	// Third round
	VPERM ZER, T0, SEL1, RED1 // 0 0 d1 d0
	VSLDOI $4, RED1, ZER, TT0 // 0 d1 d0 0
	VSLDOI $4, TT0, ZER, RED2 // d1 d0 0 0
	VSUBCUQ RED1, TT0, CAR1        // VSCBIQ TT0, RED1, CAR1
	VSUBUQM RED1, TT0, RED1        // VSQ TT0, RED1, RED1
	VSUBEUQM RED2, TT0, CAR1, RED2 // VSBIQ RED2, TT0, CAR1, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ T0, RED1, CAR1        // VACCQ T0, RED1, CAR1
	VADDUQM T0, RED1, T0          // VAQ T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ T1, RED2, CAR1, T1
	VADDUQM T2, CAR2, T2          // VAQ T2, CAR2, T2

	// Last round
	VPERM ZER, T0, SEL1, RED1 // 0 0 d1 d0
	VSLDOI $4, RED1, ZER, TT0 // 0 d1 d0 0
	VSLDOI $4, TT0, ZER, RED2 // d1 d0 0 0
	VSUBCUQ RED1, TT0, CAR1        // VSCBIQ TT0, RED1, CAR1
	VSUBUQM RED1, TT0, RED1        // VSQ TT0, RED1, RED1
	VSUBEUQM RED2, TT0, CAR1, RED2 // VSBIQ RED2, TT0, CAR1, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ T0, RED1, CAR1        // VACCQ T0, RED1, CAR1
	VADDUQM T0, RED1, T0          // VAQ T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ T1, RED2, CAR1, T1
	VADDUQM T2, CAR2, T2          // VAQ T2, CAR2, T2

	// ---------------------------------------------------

	VSUBCUQ T0, PL, CAR1        // VSCBIQ PL, T0, CAR1
	VSUBUQM T0, PL, TT0         // VSQ PL, T0, TT0
	VSUBECUQ T1, PH, CAR1, CAR2 // VSBCBIQ T1, PH, CAR1, CAR2
	VSUBEUQM T1, PH, CAR1, TT1  // VSBIQ T1, PH, CAR1, TT1
	VSUBEUQM T2, ZER, CAR2, T2  // VSBIQ T2, ZER, CAR2, T2

	VSEL TT0, T0, T2, T0
	VSEL TT1, T1, T2, T1

	// Reorder the bytes so STXVD2X can be used.
	// TT0, TT1 used for VPERM result in case
	// the caller expects T0, T1 to be good.
	XXPERMDI T0, T0, $2, TT0
	XXPERMDI T1, T1, $2, TT1

	STXVD2X TT0, (R0)(res_ptr)
	STXVD2X TT1, (R16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef CPOOL
#undef T0
#undef T1
#undef T2
#undef TT0
#undef TT1
#undef ZER
#undef SEL1
#undef SEL2
#undef CAR1
#undef CAR2
#undef RED1
#undef RED2
#undef PL
#undef PH
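// p256OrdReduce below performs a single constant-time conditional
// subtraction of the group order n (so it fully reduces inputs below 2n);
// it reuses the VSUBxQ/VSEL tail pattern of the multiply routines above.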
// func p256OrdReduce(s *p256OrdElement)
#define res_ptr R3
#define CPOOL R4

#define T0 V0
#define T1 V1
#define T2 V2
#define TT0 V3
#define TT1 V4

#define ZER V6
#define CAR1 V7
#define CAR2 V8
#define PL V9
#define PH V10

TEXT ·p256OrdReduce(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD $16, R16

	VSPLTISB $0, T2  // VZERO T2
	VSPLTISB $0, ZER // VZERO ZER

	MOVD $p256ord<>+0x00(SB), CPOOL
	LXVD2X (CPOOL+R0), PH
	LXVD2X (CPOOL+R16), PL

	LXVD2X (R16)(res_ptr), T1
	LXVD2X (R0)(res_ptr), T0

	// Put in true little endian order
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1

	VSUBCUQ T0, PL, CAR1        // VSCBIQ PL, T0, CAR1
	VSUBUQM T0, PL, TT0         // VSQ PL, T0, TT0
	VSUBECUQ T1, PH, CAR1, CAR2 // VSBCBIQ T1, PH, CAR1, CAR2
	VSUBEUQM T1, PH, CAR1, TT1  // VSBIQ T1, PH, CAR1, TT1
	VSUBEUQM T2, ZER, CAR2, T2  // VSBIQ T2, ZER, CAR2, T2

	VSEL TT0, T0, T2, T0
	VSEL TT1, T1, T2, T1

	// Reorder the bytes so STXVD2X can be used.
	// TT0, TT1 used for VPERM result in case
	// the caller expects T0, T1 to be good.
	XXPERMDI T0, T0, $2, TT0
	XXPERMDI T1, T1, $2, TT1

	STXVD2X TT0, (R0)(res_ptr)
	STXVD2X TT1, (R16)(res_ptr)

	RET

#undef res_ptr
#undef CPOOL
#undef T0
#undef T1
#undef T2
#undef TT0
#undef TT1
#undef ZER
#undef CAR1
#undef CAR2
#undef PL
#undef PH

// ---------------------------------------
// sm2p256MulInternal
// V0-V3 V30,V31 - Not Modified
// V4-V15 V28-V29 - Volatile

#define CPOOL R7

// Parameters
#define X0 V0 // Not modified
#define X1 V1 // Not modified
#define Y0 V2 // Not modified
#define Y1 V3 // Not modified
#define T0 V4 // Result
#define T1 V5 // Result
#define P0 V30 // Not modified
#define P1 V31 // Not modified

// Temporaries: lots of reused vector regs
#define YDIG V6 // Overloaded with CAR2
#define ADD1H V7 // Overloaded with ADD3H
#define ADD2H V8 // Overloaded with ADD4H
#define ADD3 V9 // Overloaded with SEL2,SEL5
#define ADD4 V10 // Overloaded with SEL3,SEL6
#define RED1 V11 // Overloaded with CAR2
#define RED2 V12 // Overloaded with TMP2
#define RED3 V13 // Overloaded with SEL1
#define T2 V14
// Overloaded temporaries
#define ADD1 V4 // Overloaded with T0
#define ADD2 V5 // Overloaded with T1
#define ADD3H V7 // Overloaded with ADD1H
#define ADD4H V8 // Overloaded with ADD2H
#define ZER V28 // Overloaded with TMP1
#define CAR1 V6 // Overloaded with YDIG
#define CAR2 V11 // Overloaded with RED1
// Constant Selects
#define SEL1 V13 // Overloaded with RED3
#define SEL2 V9 // Overloaded with ADD3,SEL5
#define SEL3 V10 // Overloaded with ADD4,SEL6
#define SEL4 V6 // Overloaded with YDIG,CAR1
#define SEL5 V9 // Overloaded with ADD3,SEL2
#define SEL6 V10 // Overloaded with ADD4,SEL3

// TMP1, TMP2 used in
// VMULT macros
#define TMP1 V13 // Overloaded with RED3
#define TMP2 V12 // Overloaded with RED2
#define ONE V29 // 1s splatted by word
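// Note the contrast with sm2p256OrdMulInternal: no K0 multiply appears in
// the reduction steps below. For the SM2 prime, p ≡ -1 (mod 2^32), so the
// Montgomery constant is -p^-1 ≡ 1 (mod 2^32), and the reduction digit is
// just the low word of the accumulator (the d0 extracted by VPERM with the
// SEL constants). The shifted add/subtract pattern then accounts for
//
//	d0*p = d0*2^256 - d0*2^224 - d0*2^96 + d0*2^64 - d0
//
// using shifted copies of d0 rather than multiplications.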
TEXT sm2p256MulInternal<>(SB), NOSPLIT, $0-16
	// CPOOL loaded from caller
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18

	// ---------------------------------------------------

	VSPLTW $3, Y0, YDIG // VREPF Y0 is input

	// VMLHF X0, YDIG, ADD1H
	// VMLHF X1, YDIG, ADD2H
	// VMLF X0, YDIG, ADD1
	// VMLF X1, YDIG, ADD2
	//
	VMULT(X0, YDIG, ADD1, ADD1H)
	VMULT(X1, YDIG, ADD2, ADD2H)

	VSPLTISW $1, ONE
	VSPLTW $2, Y0, YDIG // VREPF

	// VMALF X0, YDIG, ADD1H, ADD3
	// VMALF X1, YDIG, ADD2H, ADD4
	// VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	// VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	LXVD2X (R17)(CPOOL), SEL1
	VSPLTISB $0, ZER            // VZERO ZER
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free // VSLDB
	VSLDOI $12, ZER, ADD2, T1  // ADD2 Free // VSLDB

	VADDCUQ T0, ADD3, CAR1      // VACCQ
	VADDUQM T0, ADD3, T0        // ADD3 Free // VAQ
	VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1 // ADD4 Free // VACQ

	LXVD2X (R18)(CPOOL), SEL2
	VPERM RED3, T0, SEL2, RED1 // [ 0 0 d1 d0]
	VSLDOI $4, RED1, ZER, RED3 // [ 0 d1 d0 0]
	VSLDOI $4, RED3, ZER, RED2 // [d1 d0 0 0]
	VSUBCUQ RED1, RED3, CAR1
	VSUBUQM RED1, RED3, RED1
	VSUBEUQM RED2, RED3, CAR1, RED2 // Guaranteed not to underflow

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ T0, ADD3H, CAR1      // VACCQ
	VADDUQM T0, ADD3H, T0        // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	VADDCUQ T0, RED1, CAR1        // VACCQ
	VADDUQM T0, RED1, T0          // VAQ
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1   // VACQ
	VADDUQM T2, CAR2, T2          // VAQ

	// ---------------------------------------------------

	VSPLTW $1, Y0, YDIG // VREPF

	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	// VMALF X0, YDIG, T0, ADD1 // T0 Free->ADD1
	// VMALF X1, YDIG, T1, ADD2 // T1 Free->ADD2
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	VSPLTW $0, Y0, YDIG // VREPF

	// VMALF X0, YDIG, ADD1H, ADD3
	// VMALF X1, YDIG, ADD2H, ADD4
	// VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
	// VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	VSPLTISB $0, ZER            // VZERO ZER
	LXVD2X (R17)(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free->T0 // VSLDB
	VSLDOI $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free // VSLDB

	VADDCUQ T0, ADD3, CAR1      // VACCQ
	VADDUQM T0, ADD3, T0        // VAQ
	VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1 // VACQ

	LXVD2X (R18)(CPOOL), SEL2
	VPERM RED3, T0, SEL2, RED1 // [ 0 0 d1 d0]
	VSLDOI $4, RED1, ZER, RED3 // [ 0 d1 d0 0]
	VSLDOI $4, RED3, ZER, RED2 // [d1 d0 0 0]
	VSUBCUQ RED1, RED3, CAR1
	VSUBUQM RED1, RED3, RED1
	VSUBEUQM RED2, RED3, CAR1, RED2 // Guaranteed not to underflow

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ T0, ADD3H, CAR1      // VACCQ
	VADDUQM T0, ADD3H, T0        // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	VADDCUQ T0, RED1, CAR1        // VACCQ
	VADDUQM T0, RED1, T0          // VAQ
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1   // VACQ
	VADDUQM T2, CAR2, T2          // VAQ

	// ---------------------------------------------------

	VSPLTW $3, Y1, YDIG // VREPF

	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	// VMALF X0, YDIG, T0, ADD1
	// VMALF X1, YDIG, T1, ADD2
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	VSPLTW $2, Y1, YDIG // VREPF

	// VMALF X0, YDIG, ADD1H, ADD3
	// VMALF X1, YDIG, ADD2H, ADD4
	// VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	// VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	VSPLTISB $0, ZER            // VZERO ZER
	LXVD2X (R17)(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free // VSLDB
	VSLDOI $12, T2, ADD2, T1   // ADD2 Free // VSLDB

	VADDCUQ T0, ADD3, CAR1      // VACCQ
	VADDUQM T0, ADD3, T0        // VAQ
	VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1 // VACQ

	LXVD2X (R18)(CPOOL), SEL2
	VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
	VSLDOI $4, RED1, ZER, RED3 // [ 0 d1 d0 0]
	VSLDOI $4, RED3, ZER, RED2 // [d1 d0 0 0]
	VSUBCUQ RED1, RED3, CAR1
	VSUBUQM RED1, RED3, RED1
	VSUBEUQM RED2, RED3, CAR1, RED2 // Guaranteed not to underflow

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ T0, ADD3H, CAR1      // VACCQ
	VADDUQM T0, ADD3H, T0        // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	VADDCUQ T0, RED1, CAR1        // VACCQ
	VADDUQM T0, RED1, T0          // VAQ
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1   // VACQ
	VADDUQM T2, CAR2, T2          // VAQ

	// ---------------------------------------------------

	VSPLTW $1, Y1, YDIG // VREPF

	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	// VMALF X0, YDIG, T0, ADD1
	// VMALF X1, YDIG, T1, ADD2
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	VSPLTW $0, Y1, YDIG // VREPF

	// VMALF X0, YDIG, ADD1H, ADD3
	// VMALF X1, YDIG, ADD2H, ADD4
	// VMALHF X0, YDIG, ADD1H, ADD3H
	// VMALHF X1, YDIG, ADD2H, ADD4H
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	VSPLTISB $0, ZER            // VZERO ZER
	LXVD2X (R17)(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // VSLDB
	VSLDOI $12, T2, ADD2, T1   // VSLDB

	VADDCUQ T0, ADD3, CAR1      // VACCQ
	VADDUQM T0, ADD3, T0        // VAQ
	VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1 // VACQ

	LXVD2X (R18)(CPOOL), SEL2
	VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
	VSLDOI $4, RED1, ZER, RED3 // [ 0 d1 d0 0]
	VSLDOI $4, RED3, ZER, RED2 // [d1 d0 0 0]
	VSUBCUQ RED1, RED3, CAR1
	VSUBUQM RED1, RED3, RED1
	VSUBEUQM RED2, RED3, CAR1, RED2 // Guaranteed not to underflow

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ T0, ADD3H, CAR1      // VACCQ
	VADDUQM T0, ADD3H, T0        // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	VADDCUQ T0, RED1, CAR1        // VACCQ
	VADDUQM T0, RED1, T0          // VAQ
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1   // VACQ
	VADDUQM T2, CAR2, T2          // VAQ
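	// The tail below is the standard modular final correction: the
	// accumulator T1||T0 (plus the carry limb in T2) may still be >= p, so
	// t - p is computed and selected branch-free. A Go sketch of the idea
	// for a 4-limb value, ignoring the extra carry limb the asm keeps in
	// T2 (illustrative only):
	//
	//	import "math/bits"
	//
	//	// reduce t in [0, 2p) to [0, p), branch-free
	//	func condSub(t, p *[4]uint64) {
	//		var b uint64
	//		var d [4]uint64
	//		for i := 0; i < 4; i++ {
	//			d[i], b = bits.Sub64(t[i], p[i], b)
	//		}
	//		mask := -b // all-ones if t < p, i.e. the subtraction borrowed
	//		for i := 0; i < 4; i++ {
	//			t[i] = (d[i] &^ mask) | (t[i] & mask)
	//		}
	//	}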
	// ---------------------------------------------------

	VSPLTISB $0, RED3            // VZERO RED3
	VSUBCUQ T0, P0, CAR1         // VSCBIQ
	VSUBUQM T0, P0, ADD1H        // VSQ
	VSUBECUQ T1, P1, CAR1, CAR2  // VSBCBIQ
	VSUBEUQM T1, P1, CAR1, ADD2H // VSBIQ
	VSUBEUQM T2, RED3, CAR2, T2  // VSBIQ

	// what output to use, ADD2H||ADD1H or T1||T0?
	VSEL ADD1H, T0, T2, T0
	VSEL ADD2H, T1, T2, T1
	RET

#undef CPOOL

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef P0
#undef P1

#undef SEL1
#undef SEL2
#undef SEL3
#undef SEL4
#undef SEL5
#undef SEL6

#undef YDIG
#undef ADD1H
#undef ADD2H
#undef ADD3
#undef ADD4
#undef RED1
#undef RED2
#undef RED3
#undef T2
#undef ADD1
#undef ADD2
#undef ADD3H
#undef ADD4H
#undef ZER
#undef CAR1
#undef CAR2

#undef TMP1
#undef TMP2

#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
	VSPLTISB $0, ZER            \ // VZERO
	VSUBCUQ X0, Y0, CAR1        \
	VSUBUQM X0, Y0, T0          \
	VSUBECUQ X1, Y1, CAR1, SEL1 \
	VSUBEUQM X1, Y1, CAR1, T1   \
	VSUBUQM ZER, SEL1, SEL1     \ // VSQ
	\
	VADDCUQ T0, PL, CAR1        \ // VACCQ
	VADDUQM T0, PL, TT0         \ // VAQ
	VADDEUQM T1, PH, CAR1, TT1  \ // VACQ
	\
	VSEL TT0, T0, SEL1, T0      \
	VSEL TT1, T1, SEL1, T1      \

#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
	VADDCUQ X0, Y0, CAR1         \
	VADDUQM X0, Y0, T0           \
	VADDECUQ X1, Y1, CAR1, T2    \ // VACCCQ
	VADDEUQM X1, Y1, CAR1, T1    \
	\
	VSPLTISB $0, ZER             \
	VSUBCUQ T0, PL, CAR1         \ // VSCBIQ
	VSUBUQM T0, PL, TT0          \
	VSUBECUQ T1, PH, CAR1, CAR2  \ // VSBCBIQ
	VSUBEUQM T1, PH, CAR1, TT1   \ // VSBIQ
	VSUBEUQM T2, ZER, CAR2, SEL1 \
	\
	VSEL TT0, T0, SEL1, T0       \
	VSEL TT1, T1, SEL1, T1

#define p256HalfInternal(T1, T0, X1, X0) \
	VSPLTISB $0, ZER            \
	VSUBEUQM ZER, ZER, X0, SEL1 \
	\
	VADDCUQ X0, PL, CAR1        \
	VADDUQM X0, PL, T0          \
	VADDECUQ X1, PH, CAR1, T2   \
	VADDEUQM X1, PH, CAR1, T1   \
	\
	VSEL T0, X0, SEL1, T0       \
	VSEL T1, X1, SEL1, T1       \
	VSEL T2, ZER, SEL1, T2      \
	\
	VSLDOI $15, T2, ZER, TT1    \
	VSLDOI $15, T1, ZER, TT0    \
	VSPLTISB $1, SEL1           \
	VSR T0, SEL1, T0            \ // VSRL
	VSR T1, SEL1, T1            \
	VSPLTISB $7, SEL1           \ // VREPIB
	VSL TT0, SEL1, TT0          \
	VSL TT1, SEL1, TT1          \
	VOR T0, TT0, T0             \
	VOR T1, TT1, T1
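// p256HalfInternal above computes x/2 mod p: when x is odd, p is added
// first (making the sum even, with the 257th bit kept in T2), then the
// whole value is shifted right one bit. A math/big sketch (illustrative
// only):
//
//	func half(x, p *big.Int) *big.Int {
//		t := new(big.Int).Set(x)
//		if t.Bit(0) == 1 { // done branch-free in the asm via SEL1
//			t.Add(t, p)
//		}
//		return t.Rsh(t, 1)
//	}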
#define res_ptr R3
#define x_ptr R4
#define y_ptr R5
#define CPOOL R7
#define TEMP R8
#define N R9

// Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

// Constants
#define P0 V30
#define P1 V31
// func p256MulAsm(res, in1, in2 *p256Element)
TEXT ·p256Mul(SB), NOSPLIT, $0-24
	MOVD res+0(FP), res_ptr
	MOVD in1+8(FP), x_ptr
	MOVD in2+16(FP), y_ptr
	MOVD $16, R16
	MOVD $32, R17

	MOVD $p256mul<>+0x00(SB), CPOOL

	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

	LXVD2X (R0)(y_ptr), Y0
	LXVD2X (R16)(y_ptr), Y1

	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1

	LXVD2X (R16)(CPOOL), P1
	LXVD2X (R0)(CPOOL), P0

	CALL sm2p256MulInternal<>(SB)

	MOVD $p256mul<>+0x00(SB), CPOOL // What's the purpose of this?

	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X T0, (R0)(res_ptr)
	STXVD2X T1, (R16)(res_ptr)
	RET

// func p256Sqr(res, in *p256Element, n int)
TEXT ·p256Sqr(SB), NOSPLIT, $0-24
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr
	MOVD n+16(FP), N
	MOVD $16, R16
	MOVD $32, R17

	MOVD $p256mul<>+0x00(SB), CPOOL
	LXVD2X (R16)(CPOOL), P1
	LXVD2X (R0)(CPOOL), P0

	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

sqrLoop:
	// Sqr uses same value for both

	VOR X0, X0, Y0
	VOR X1, X1, Y1

	CALL sm2p256MulInternal<>(SB)

	ADD $-1, N
	CMP $0, N
	BEQ done
	VOR T0, T0, X0
	VOR T1, T1, X1
	BR sqrLoop

done:
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X T0, (R0)(res_ptr)
	STXVD2X T1, (R16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef CPOOL

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef P0
#undef P1

#define P3ptr R3
#define P1ptr R4
#define P2ptr R5
#define CPOOL R7

// Temporaries in REGs
#define Y2L V15
#define Y2H V16
#define T1L V17
#define T1H V18
#define T2L V19
#define T2H V20
#define T3L V21
#define T3H V22
#define T4L V23
#define T4H V24

// Temps for Sub and Add
#define TT0 V11
#define TT1 V12
#define T2 V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31

// Names for zero/sel selects
#define X1L V0
#define X1H V1
#define Y1L V2 // p256MulAsmParmY
#define Y1H V3 // p256MulAsmParmY
#define Z1L V4
#define Z1H V5
#define X2L V0
#define X2H V1
#define Z2L V4
#define Z2H V5
#define X3L V17 // T1L
#define X3H V18 // T1H
#define Y3L V21 // T3L
#define Y3H V22 // T3H
#define Z3L V25
#define Z3H V26

#define ZER V6
#define SEL1 V7
#define CAR1 V8
#define CAR2 V9
/*
 * Three operand formula:
 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
 * T1 = Z1²
 * T2 = T1*Z1
 * T1 = T1*X2
 * T2 = T2*Y2
 * T1 = T1-X1
 * T2 = T2-Y1
 * Z3 = Z1*T1
 * T3 = T1²
 * T4 = T3*T1
 * T3 = T3*X1
 * T1 = 2*T3
 * X3 = T2²
 * X3 = X3-T1
 * X3 = X3-T4
 * T3 = T3-X3
 * T3 = T3*T2
 * T4 = T4*Y1
 * Y3 = T3-T4
 *
 * Three operand formulas, but with MulInternal X,Y used to store temps
 * X=Z1; Y=Z1; MUL;T-   // T1 = Z1²      T1
 * X=T ; Y- ; MUL;T2=T  // T2 = T1*Z1    T1   T2
 * X- ; Y=X2; MUL;T1=T  // T1 = T1*X2    T1   T2
 * X=T2; Y=Y2; MUL;T-   // T2 = T2*Y2    T1   T2
 * SUB(T2<T-Y1)         // T2 = T2-Y1    T1   T2
 * SUB(Y<T1-X1)         // T1 = T1-X1    T1   T2
 * X=Z1; Y- ; MUL;Z3:=T // Z3 = Z1*T1         T2
 * X=Y; Y- ; MUL;X=T    // T3 = T1*T1         T2
 * X- ; Y- ; MUL;T4=T   // T4 = T3*T1         T2        T4
 * X- ; Y=X1; MUL;T3=T  // T3 = T3*X1         T2   T3   T4
 * ADD(T1<T+T)          // T1 = T3+T3    T1   T2   T3   T4
 * X=T2; Y=T2; MUL;T-   // X3 = T2*T2    T1   T2   T3   T4
 * SUB(T<T-T1)          // X3 = X3-T1    T1   T2   T3   T4
 * SUB(T<T-T4) X3:=T    // X3 = X3-T4         T2   T3   T4
 * SUB(X<T3-T)          // T3 = T3-X3         T2   T3   T4
 * X- ; Y- ; MUL;T3=T   // T3 = T3*T2         T2   T3   T4
 * X=T4; Y=Y1; MUL;T-   // T4 = T4*Y1              T3   T4
 * SUB(T<T3-T) Y3:=T    // Y3 = T3-T4              T3   T4
 */
//
// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD in1+8(FP), P1ptr
	MOVD in2+16(FP), P2ptr

	MOVD $p256mul<>+0x00(SB), CPOOL

	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20
	MOVD $96, R21
	MOVD $112, R22
	MOVD $128, R23
	MOVD $144, R24
	MOVD $160, R25
	MOVD $88, R26 // offset of sign+24(FP): 24 + 64

	LXVD2X (R16)(CPOOL), PH
	LXVD2X (R0)(CPOOL), PL

	LXVD2X (R17)(P2ptr), Y2L
	LXVD2X (R18)(P2ptr), Y2H
	XXPERMDI Y2H, Y2H, $2, Y2H
	XXPERMDI Y2L, Y2L, $2, Y2L

	// Equivalent of VLREPG sign+24(FP), SEL1
	LXVDSX (R1)(R26), SEL1
	VSPLTISB $0, ZER
	VCMPEQUD SEL1, ZER, SEL1

	VSUBCUQ PL, Y2L, CAR1
	VSUBUQM PL, Y2L, T1L
	VSUBEUQM PH, Y2H, CAR1, T1H

	VSEL T1L, Y2L, SEL1, Y2L
	VSEL T1H, Y2H, SEL1, Y2H

	/*
	 * Three operand formula:
	 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
	 */
	// X=Z1; Y=Z1; MUL; T- // T1 = Z1² T1
	LXVD2X (R19)(P1ptr), X0 // Z1H
	LXVD2X (R20)(P1ptr), X1 // Z1L
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL sm2p256MulInternal<>(SB)

	// X=T ; Y- ; MUL; T2=T // T2 = T1*Z1 T1 T2
	VOR T0, T0, X0
	VOR T1, T1, X1
	CALL sm2p256MulInternal<>(SB)
	VOR T0, T0, T2L
	VOR T1, T1, T2H

	// X- ; Y=X2; MUL; T1=T // T1 = T1*X2 T1 T2
	MOVD in2+16(FP), P2ptr
	LXVD2X (R0)(P2ptr), Y0  // X2H
	LXVD2X (R16)(P2ptr), Y1 // X2L
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL sm2p256MulInternal<>(SB)
	VOR T0, T0, T1L
	VOR T1, T1, T1H

	// X=T2; Y=Y2; MUL; T- // T2 = T2*Y2 T1 T2
	VOR T2L, T2L, X0
	VOR T2H, T2H, X1
	VOR Y2L, Y2L, Y0
	VOR Y2H, Y2H, Y1
	CALL sm2p256MulInternal<>(SB)

	// SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
	MOVD in1+8(FP), P1ptr
	LXVD2X (R17)(P1ptr), Y1L
	LXVD2X (R18)(P1ptr), Y1H
	XXPERMDI Y1H, Y1H, $2, Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L
	p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)

	// SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
	LXVD2X (R0)(P1ptr), X1L
	LXVD2X (R16)(P1ptr), X1H
	XXPERMDI X1H, X1H, $2, X1H
	XXPERMDI X1L, X1L, $2, X1L
	p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)

	// X=Z1; Y- ; MUL; Z3:=T // Z3 = Z1*T1 T2
	LXVD2X (R19)(P1ptr), X0 // Z1H
	LXVD2X (R20)(P1ptr), X1 // Z1L
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	CALL sm2p256MulInternal<>(SB)

	VOR T0, T0, Z3L
	VOR T1, T1, Z3H

	// X=Y; Y- ; MUL; X=T // T3 = T1*T1 T2
	VOR Y0, Y0, X0
	VOR Y1, Y1, X1
	CALL sm2p256MulInternal<>(SB)
	VOR T0, T0, X0
	VOR T1, T1, X1

	// X- ; Y- ; MUL; T4=T // T4 = T3*T1 T2 T4
	CALL sm2p256MulInternal<>(SB)
	VOR T0, T0, T4L
	VOR T1, T1, T4H

	// X- ; Y=X1; MUL; T3=T // T3 = T3*X1 T2 T3 T4
	MOVD in1+8(FP), P1ptr
	LXVD2X (R0)(P1ptr), Y0  // X1H
	LXVD2X (R16)(P1ptr), Y1 // X1L
	XXPERMDI Y1, Y1, $2, Y1
	XXPERMDI Y0, Y0, $2, Y0
	CALL sm2p256MulInternal<>(SB)
	VOR T0, T0, T3L
	VOR T1, T1, T3H

	// ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4
	p256AddInternal(T1H,T1L, T1,T0,T1,T0)

	// X=T2; Y=T2; MUL; T- // X3 = T2*T2 T1 T2 T3 T4
	VOR T2L, T2L, X0
	VOR T2H, T2H, X1
	VOR T2L, T2L, Y0
	VOR T2H, T2H, Y1
	CALL sm2p256MulInternal<>(SB)

	// SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4 (T1 = X3)
	p256SubInternal(T1,T0,T1,T0,T1H,T1L)

	// SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4
	p256SubInternal(T1,T0,T1,T0,T4H,T4L)
	VOR T0, T0, X3L
	VOR T1, T1, X3H

	// SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4
	p256SubInternal(X1,X0,T3H,T3L,T1,T0)

	// X- ; Y- ; MUL; T3=T // T3 = T3*T2 T2 T3 T4
	CALL sm2p256MulInternal<>(SB)
	VOR T0, T0, T3L
	VOR T1, T1, T3H

	// X=T4; Y=Y1; MUL; T- // T4 = T4*Y1 T3 T4
	VOR T4L, T4L, X0
	VOR T4H, T4H, X1
	MOVD in1+8(FP), P1ptr
	LXVD2X (R17)(P1ptr), Y0 // Y1H
	LXVD2X (R18)(P1ptr), Y1 // Y1L
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL sm2p256MulInternal<>(SB)

	// SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4 (T3 = Y3)
	p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)

	// if (sel == 0) {
	//      copy(P3.x[:], X1)
	//      copy(P3.y[:], Y1)
	//      copy(P3.z[:], Z1)
	// }

	LXVD2X (R0)(P1ptr), X1L
	LXVD2X (R16)(P1ptr), X1H
	XXPERMDI X1H, X1H, $2, X1H
	XXPERMDI X1L, X1L, $2, X1L

	// Y1 already loaded, left over from addition
	LXVD2X (R19)(P1ptr), Z1L
	LXVD2X (R20)(P1ptr), Z1H
	XXPERMDI Z1H, Z1H, $2, Z1H
	XXPERMDI Z1L, Z1L, $2, Z1L

	LXVDSX (R1)(R21), SEL1 // Get offset to sel+32
	VSPLTISB $0, ZER
	VCMPEQUD SEL1, ZER, SEL1

	VSEL X3L, X1L, SEL1, X3L
	VSEL X3H, X1H, SEL1, X3H
	VSEL Y3L, Y1L, SEL1, Y3L
	VSEL Y3H, Y1H, SEL1, Y3H
	VSEL Z3L, Z1L, SEL1, Z3L
	VSEL Z3H, Z1H, SEL1, Z3H

	MOVD in2+16(FP), P2ptr
	LXVD2X (R0)(P2ptr), X2L
	LXVD2X (R16)(P2ptr), X2H
	XXPERMDI X2H, X2H, $2, X2H
	XXPERMDI X2L, X2L, $2, X2L

	// Y2 already loaded
	LXVD2X (R19)(CPOOL), Z2L
	LXVD2X (R20)(CPOOL), Z2H

	LXVDSX (R1)(R22), SEL1 // Get the value from zero+40(FP)
	VSPLTISB $0, ZER
	VCMPEQUD SEL1, ZER, SEL1

	VSEL X3L, X2L, SEL1, X3L
	VSEL X3H, X2H, SEL1, X3H
	VSEL Y3L, Y2L, SEL1, Y3L
	VSEL Y3H, Y2H, SEL1, Y3H
	VSEL Z3L, Z2L, SEL1, Z3L
	VSEL Z3H, Z2H, SEL1, Z3H

	// Reorder the bytes so they can be stored using STXVD2X.
	MOVD res+0(FP), P3ptr
	XXPERMDI X3H, X3H, $2, X3H
	XXPERMDI X3L, X3L, $2, X3L
	XXPERMDI Y3H, Y3H, $2, Y3H
	XXPERMDI Y3L, Y3L, $2, Y3L
	XXPERMDI Z3H, Z3H, $2, Z3H
	XXPERMDI Z3L, Z3L, $2, Z3L
	STXVD2X X3L, (R0)(P3ptr)
	STXVD2X X3H, (R16)(P3ptr)
	STXVD2X Y3L, (R17)(P3ptr)
	STXVD2X Y3H, (R18)(P3ptr)
	STXVD2X Z3L, (R19)(P3ptr)
	STXVD2X Z3H, (R20)(P3ptr)

	RET

#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef CPOOL

#undef Y2L
#undef Y2H
#undef T1L
#undef T1H
#undef T2L
#undef T2H
#undef T3L
#undef T3H
#undef T4L
#undef T4H

#undef TT0
#undef TT1
#undef T2

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1

#undef PL
#undef PH

#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Z2L
#undef Z2H
#undef X3L
#undef X3H
#undef Y3L
#undef Y3H
#undef Z3L
#undef Z3H

#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2

// http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
// http://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
// http://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
#define P3ptr R3
#define P1ptr R4
#define CPOOL R7

// Temporaries in REGs
#define X3L V15
#define X3H V16
#define Y3L V17
#define Y3H V18
#define T1L V19
#define T1H V20
#define T2L V21
#define T2H V22
#define T3L V23
#define T3H V24

#define X1L V6
#define X1H V7
#define Y1L V8
#define Y1H V9
#define Z1L V10
#define Z1H V11

// Temps for Sub and Add
#define TT0 V11
#define TT1 V12
#define T2 V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31

#define Z3L V23
#define Z3H V24

#define ZER V26
#define SEL1 V27
#define CAR1 V28
#define CAR2 V29
#define p256PointDoubleRound(P1ptr, P3ptr) \
	\ // X=Z1; Y=Z1; MUL; T-      // T1 = Z1²
	LXVD2X (R19)(P1ptr), X0 \ // Z1L
	LXVD2X (R20)(P1ptr), X1 \ // Z1H
	\
	XXPERMDI X0, X0, $2, X0 \
	XXPERMDI X1, X1, $2, X1 \
	\
	VOR  X0, X0, Y0 \
	VOR  X1, X1, Y1 \
	CALL sm2p256MulInternal<>(SB) \
	\
	\ // SUB(X<X1-T)              // T2 = X1-T1
	LXVD2X (R0)(P1ptr), X1L \
	LXVD2X (R16)(P1ptr), X1H \
	XXPERMDI X1L, X1L, $2, X1L \
	XXPERMDI X1H, X1H, $2, X1H \
	\
	p256SubInternal(X1,X0,X1H,X1L,T1,T0) \
	\
	\ // ADD(Y<X1+T)              // T1 = X1+T1
	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0) \
	\
	\ // X- ; Y- ; MUL; T-        // T2 = T2*T1
	CALL sm2p256MulInternal<>(SB) \
	\
	\ // ADD(T2<T+T); ADD(T2<T2+T) // T2 = 3*T2
	p256AddInternal(T2H,T2L,T1,T0,T1,T0) \
	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0) \
	\
	\ // ADD(X<Y1+Y1)             // Y3 = 2*Y1
	LXVD2X (R17)(P1ptr), Y1L \
	LXVD2X (R18)(P1ptr), Y1H \
	XXPERMDI Y1L, Y1L, $2, Y1L \
	XXPERMDI Y1H, Y1H, $2, Y1H \
	\
	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L) \
	\
	\ // X- ; Y=Z1; MUL; Z3:=T    // Z3 = Y3*Z1
	LXVD2X (R19)(P1ptr), Y0 \
	LXVD2X (R20)(P1ptr), Y1 \
	XXPERMDI Y0, Y0, $2, Y0 \
	XXPERMDI Y1, Y1, $2, Y1 \
	\
	CALL sm2p256MulInternal<>(SB) \
	\
	\ // Leave T0, T1 as is.
	XXPERMDI T0, T0, $2, TT0 \
	XXPERMDI T1, T1, $2, TT1 \
	STXVD2X TT0, (R19)(P3ptr) \
	STXVD2X TT1, (R20)(P3ptr) \
	\
	\ // X- ; Y=X ; MUL; T-       // Y3 = Y3²
	VOR  X0, X0, Y0 \
	VOR  X1, X1, Y1 \
	CALL sm2p256MulInternal<>(SB) \
	\
	\ // X=T ; Y=X1; MUL; T3=T    // T3 = Y3*X1
	VOR  T0, T0, X0 \
	VOR  T1, T1, X1 \
	LXVD2X (R0)(P1ptr), Y0 \
	LXVD2X (R16)(P1ptr), Y1 \
	XXPERMDI Y0, Y0, $2, Y0 \
	XXPERMDI Y1, Y1, $2, Y1 \
	CALL sm2p256MulInternal<>(SB) \
	VOR  T0, T0, T3L \
	VOR  T1, T1, T3H \
	\
	\ // X- ; Y=X ; MUL; T-       // Y3 = Y3²
	VOR  X0, X0, Y0 \
	VOR  X1, X1, Y1 \
	CALL sm2p256MulInternal<>(SB) \
	\
	\ // HAL(Y3<T)                // Y3 = half*Y3
	p256HalfInternal(Y3H,Y3L,T1,T0) \
	\
	\ // X=T2; Y=T2; MUL; T-      // X3 = T2²
	VOR  T2L, T2L, X0 \
	VOR  T2H, T2H, X1 \
	VOR  T2L, T2L, Y0 \
	VOR  T2H, T2H, Y1 \
	CALL sm2p256MulInternal<>(SB) \
	\
	\ // ADD(T1<T3+T3)            // T1 = 2*T3
	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L) \
	\
	\ // SUB(X3<T-T1) X3:=X3      // X3 = X3-T1
	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L) \
	\
	XXPERMDI X3L, X3L, $2, TT0 \
	XXPERMDI X3H, X3H, $2, TT1 \
	STXVD2X TT0, (R0)(P3ptr) \
	STXVD2X TT1, (R16)(P3ptr) \
	\
	\ // SUB(X<T3-X3)             // T1 = T3-X3
	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L) \
	\
	\ // X- ; Y- ; MUL; T-        // T1 = T1*T2
	CALL sm2p256MulInternal<>(SB) \
	\
	\ // SUB(Y3<T-Y3)             // Y3 = T1-Y3
	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L) \
	\
	XXPERMDI Y3L, Y3L, $2, Y3L \
	XXPERMDI Y3H, Y3H, $2, Y3H \
	STXVD2X Y3L, (R17)(P3ptr) \
	STXVD2X Y3H, (R18)(P3ptr) \

// func p256PointDoubleAsm(res, in *p256Point)
TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0-16
	MOVD res+0(FP), P3ptr
	MOVD in+8(FP), P1ptr

	MOVD $p256mul<>+0x00(SB), CPOOL

	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	LXVD2X (R16)(CPOOL), PH
	LXVD2X (R0)(CPOOL), PL

	p256PointDoubleRound(P1ptr, P3ptr)
	RET

// func p256PointDouble6TimesAsm(res, in *p256Point)
TEXT ·p256PointDouble6TimesAsm(SB), NOSPLIT, $0-16
	MOVD res+0(FP), P3ptr
	MOVD in+8(FP), P1ptr

	MOVD $p256mul<>+0x00(SB), CPOOL

	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	LXVD2X (R16)(CPOOL), PH
	LXVD2X (R0)(CPOOL), PL

	p256PointDoubleRound(P1ptr, P3ptr)
	p256PointDoubleRound(P3ptr, P3ptr)
	p256PointDoubleRound(P3ptr, P3ptr)
	p256PointDoubleRound(P3ptr, P3ptr)
	p256PointDoubleRound(P3ptr, P3ptr)
	p256PointDoubleRound(P3ptr, P3ptr)
	RET

#undef P3ptr
#undef P1ptr
#undef CPOOL
#undef X3L
#undef X3H
#undef Y3L
#undef Y3H
#undef T1L
#undef T1H
#undef T2L
#undef T2H
#undef T3L
#undef T3H
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef TT0
#undef TT1
#undef T2
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef PL
#undef PH
#undef Z3L
#undef Z3H
#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2
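
// The HAL step in the doubling round above (p256HalfInternal) divides by
// two modulo the odd prime p: an even value is simply shifted right, an
// odd value has p added first (p is odd, so the sum is even) before the
// shift. A plain-integer sketch of the same identity (hypothetical
// helper, assumes import "math/big"; the assembly computes this in
// constant time instead of branching):
//
//	// halfMod returns v/2 mod p for odd p and 0 <= v < p.
//	func halfMod(v, p *big.Int) *big.Int {
//		r := new(big.Int).Set(v)
//		if r.Bit(0) == 1 {
//			r.Add(r, p) // v odd: v+p is even because p is odd
//		}
//		return r.Rsh(r, 1)
//	}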
#define P3ptr R3
#define P1ptr R4
#define P2ptr R5
#define CPOOL R7
#define TRUE  R14
#define RES1  R9
#define RES2  R10

// Temporaries in REGs
#define T1L V16
#define T1H V17
#define T2L V18
#define T2H V19
#define U1L V20
#define U1H V21
#define S1L V22
#define S1H V23
#define HL  V24
#define HH  V25
#define RL  V26
#define RH  V27

// Temps for Sub and Add
#define ZER  V6
#define SEL1 V7
#define CAR1 V8
#define CAR2 V9
#define TT0  V11
#define TT1  V12
#define T2   V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31
/*
 * https://choucroutage.com/Papers/SideChannelAttacks/ctrsa-2011-brown.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
 *
 * A  = X₁×Z₂²
 * B  = Y₁×Z₂³
 * C  = X₂×Z₁²-A
 * D  = Y₂×Z₁³-B
 * X₃ = D² - 2A×C² - C³
 * Y₃ = D×(A×C² - X₃) - B×C³
 * Z₃ = Z₁×Z₂×C
 *
 * Three-operand formula (adopted): http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
 *
 * T1 = Z1*Z1
 * T2 = Z2*Z2
 * U1 = X1*T2
 * H  = X2*T1
 * H  = H-U1
 * Z3 = Z1*Z2
 * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
 *
 * S1 = Z2*T2
 * S1 = Y1*S1
 * R  = Z1*T1
 * R  = Y2*R
 * R  = R-S1
 *
 * T1 = H*H
 * T2 = H*T1
 * U1 = U1*T1
 *
 * X3 = R*R
 * X3 = X3-T2
 * T1 = 2*U1
 * X3 = X3-T1 << store-out X3 result reg
 *
 * T2 = S1*T2
 * Y3 = U1-X3
 * Y3 = R*Y3
 * Y3 = Y3-T2 << store-out Y3 result reg
 *
 * // X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
 * // X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
 * // X=X2; Y-  ; MUL; H=T  // H  = X2*T1
 * // X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
 * // X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
 * // X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
 * // SUB(H<H-T)            // H  = H-U1
 * // X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
 * // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
 * // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
 * // X=Y2; Y=R ; MUL; T-   // R  = Y2*R
 * // SUB(R<T-S1)           // R  = R-S1
 * // X=H ; Y=H ; MUL; T-   // T1 = H*H
 * // X-  ; Y=T ; MUL; T2=T // T2 = H*T1
 * // X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
 * // X=R ; Y=R ; MUL; T-   // X3 = R*R
 * // SUB(T<T-T2)           // X3 = X3-T2
 * // ADD(X<U1+U1)          // T1 = 2*U1
 * // SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
 * // SUB(Y<U1-T)           // Y3 = U1-X3
 * // X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
 * // X=S1; Y=T2; MUL; T-   // T2 = S1*T2
 * // SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
 */
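
// For reference, the same add-1998-cmo-2 sequence on big integers,
// including the degenerate-case flag that the assembly below reports via
// its return value. Illustration only, not part of this package: it
// assumes import "math/big", ignores the Montgomery representation, and
// the helper name is hypothetical.
//
//	func jacobianAdd(x1, y1, z1, x2, y2, z2, p *big.Int) (x3, y3, z3 *big.Int, equal bool) {
//		mod := func(v *big.Int) *big.Int { return v.Mod(v, p) }
//		t1 := mod(new(big.Int).Mul(z1, z1))           // T1 = Z1*Z1
//		t2 := mod(new(big.Int).Mul(z2, z2))           // T2 = Z2*Z2
//		u1 := mod(new(big.Int).Mul(x1, t2))           // U1 = X1*T2
//		h := mod(new(big.Int).Mul(x2, t1))            // H  = X2*T1
//		h = mod(h.Sub(h, u1))                         // H  = H-U1
//		z3 = mod(new(big.Int).Mul(z1, z2))            // Z3 = Z1*Z2
//		z3 = mod(z3.Mul(z3, h))                       // Z3 = Z3*H
//		s1 := mod(new(big.Int).Mul(z2, t2))           // S1 = Z2*T2
//		s1 = mod(s1.Mul(y1, s1))                      // S1 = Y1*S1
//		r := mod(new(big.Int).Mul(z1, t1))            // R  = Z1*T1
//		r = mod(r.Mul(y2, r))                         // R  = Y2*R
//		r = mod(r.Sub(r, s1))                         // R  = R-S1
//		equal = h.Sign() == 0 && r.Sign() == 0        // what ret reports
//		t1 = mod(t1.Mul(h, h))                        // T1 = H*H
//		t2 = mod(t2.Mul(h, t1))                       // T2 = H*T1
//		u1 = mod(u1.Mul(u1, t1))                      // U1 = U1*T1
//		x3 = mod(new(big.Int).Mul(r, r))              // X3 = R*R
//		x3 = mod(x3.Sub(x3, t2))                      // X3 = X3-T2
//		x3 = mod(x3.Sub(x3, new(big.Int).Lsh(u1, 1))) // X3 = X3-2*U1
//		t2 = mod(t2.Mul(s1, t2))                      // T2 = S1*T2
//		y3 = mod(new(big.Int).Sub(u1, x3))            // Y3 = U1-X3
//		y3 = mod(y3.Mul(r, y3))                       // Y3 = R*Y3
//		y3 = mod(y3.Sub(y3, t2))                      // Y3 = Y3-T2
//		return
//	}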
// func p256PointAddAsm(res, in1, in2 *p256Point) int
TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD in1+8(FP), P1ptr
	MOVD $p256mul<>+0x00(SB), CPOOL
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	LXVD2X (R16)(CPOOL), PH
	LXVD2X (R0)(CPOOL), PL

	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
	LXVD2X (R19)(P1ptr), X0 // Z1L
	LXVD2X (R20)(P1ptr), X1 // Z1H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR  X0, X0, Y0
	VOR  X1, X1, Y1
	CALL sm2p256MulInternal<>(SB)

	// X-  ; Y=T ; MUL; R=T  // R = Z1*T1
	VOR  T0, T0, Y0
	VOR  T1, T1, Y1
	CALL sm2p256MulInternal<>(SB)
	VOR  T0, T0, RL // SAVE: RL
	VOR  T1, T1, RH // SAVE: RH

	// X=X2; Y-  ; MUL; H=T  // H = X2*T1
	MOVD in2+16(FP), P2ptr
	LXVD2X (R0)(P2ptr), X0  // X2L
	LXVD2X (R16)(P2ptr), X1 // X2H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	CALL sm2p256MulInternal<>(SB)
	VOR  T0, T0, HL // SAVE: HL
	VOR  T1, T1, HH // SAVE: HH

	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
	MOVD in2+16(FP), P2ptr
	LXVD2X (R19)(P2ptr), X0 // Z2L
	LXVD2X (R20)(P2ptr), X1 // Z2H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR  X0, X0, Y0
	VOR  X1, X1, Y1
	CALL sm2p256MulInternal<>(SB)

	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
	VOR  T0, T0, Y0
	VOR  T1, T1, Y1
	CALL sm2p256MulInternal<>(SB)
	VOR  T0, T0, S1L // SAVE: S1L
	VOR  T1, T1, S1H // SAVE: S1H

	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
	MOVD in1+8(FP), P1ptr
	LXVD2X (R0)(P1ptr), X0  // X1L
	LXVD2X (R16)(P1ptr), X1 // X1H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	CALL sm2p256MulInternal<>(SB)
	VOR  T0, T0, U1L // SAVE: U1L
	VOR  T1, T1, U1H // SAVE: U1H

	// SUB(H<H-T)            // H = H-U1
	p256SubInternal(HH,HL,HH,HL,T1,T0)

	// if H == 0 or H^P == 0 then ret=1 else ret=0
	// clobbers T1H and T1L
	MOVD $1, TRUE
	VSPLTISB $0, ZER
	VOR  HL, HH, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES1
	VXOR HL, PL, T1L // SAVE: T1L
	VXOR HH, PH, T1H // SAVE: T1H
	VOR  T1L, T1H, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES2
	OR   RES2, RES1, RES1
	MOVD RES1, ret+24(FP)

	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
	MOVD in1+8(FP), P1ptr
	MOVD in2+16(FP), P2ptr
	LXVD2X (R19)(P1ptr), X0 // Z1L
	LXVD2X (R20)(P1ptr), X1 // Z1H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	LXVD2X (R19)(P2ptr), Y0 // Z2L
	LXVD2X (R20)(P2ptr), Y1 // Z2H
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL sm2p256MulInternal<>(SB)

	// X=T ; Y=H ; MUL; Z3:=T // Z3 = Z3*H
	VOR  T0, T0, X0
	VOR  T1, T1, X1
	VOR  HL, HL, Y0
	VOR  HH, HH, Y1
	CALL sm2p256MulInternal<>(SB)
	MOVD res+0(FP), P3ptr
	XXPERMDI T1, T1, $2, TT1
	XXPERMDI T0, T0, $2, TT0
	STXVD2X TT0, (R19)(P3ptr)
	STXVD2X TT1, (R20)(P3ptr)

	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
	MOVD in1+8(FP), P1ptr
	LXVD2X (R17)(P1ptr), X0 // Y1L
	LXVD2X (R18)(P1ptr), X1 // Y1H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR  S1L, S1L, Y0
	VOR  S1H, S1H, Y1
	CALL sm2p256MulInternal<>(SB)
	VOR  T0, T0, S1L
	VOR  T1, T1, S1H

	// X=Y2; Y=R ; MUL; T-   // R = Y2*R
	MOVD in2+16(FP), P2ptr
	LXVD2X (R17)(P2ptr), X0 // Y2L
	LXVD2X (R18)(P2ptr), X1 // Y2H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR  RL, RL, Y0
	VOR  RH, RH, Y1
	CALL sm2p256MulInternal<>(SB)

	// SUB(R<T-S1)           // R = T-S1
	p256SubInternal(RH,RL,T1,T0,S1H,S1L)

	// if R == 0 or R^P == 0 then ret=ret else ret=0
	// clobbers T1H and T1L
	// Redo this using ISEL??
	MOVD $1, TRUE
	VSPLTISB $0, ZER
	VOR  RL, RH, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES1
	VXOR RL, PL, T1L // SAVE: T1L
	VXOR RH, PH, T1H // SAVE: T1H
	VOR  T1L, T1H, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES2
	OR   RES2, RES1, RES1
	MOVD ret+24(FP), RES2
	AND  RES2, RES1, RES1
	MOVD RES1, ret+24(FP)

	// X=H ; Y=H ; MUL; T-   // T1 = H*H
	VOR  HL, HL, X0
	VOR  HH, HH, X1
	VOR  HL, HL, Y0
	VOR  HH, HH, Y1
	CALL sm2p256MulInternal<>(SB)

	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
	VOR  T0, T0, Y0
	VOR  T1, T1, Y1
	CALL sm2p256MulInternal<>(SB)
	VOR  T0, T0, T2L
	VOR  T1, T1, T2H

	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
	VOR  U1L, U1L, X0
	VOR  U1H, U1H, X1
	CALL sm2p256MulInternal<>(SB)
	VOR  T0, T0, U1L
	VOR  T1, T1, U1H

	// X=R ; Y=R ; MUL; T-   // X3 = R*R
	VOR  RL, RL, X0
	VOR  RL, RL, Y0
	VOR  RH, RH, X1
	VOR  RH, RH, Y1
	CALL sm2p256MulInternal<>(SB)

	// SUB(T<T-T2)           // X3 = X3-T2
	p256SubInternal(T1,T0,T1,T0,T2H,T2L)

	// ADD(X<U1+U1)          // T1 = 2*U1
	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)

	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
	p256SubInternal(T1,T0,T1,T0,X1,X0)
	MOVD res+0(FP), P3ptr
	XXPERMDI T1, T1, $2, TT1
	XXPERMDI T0, T0, $2, TT0
	STXVD2X TT0, (R0)(P3ptr)
	STXVD2X TT1, (R16)(P3ptr)

	// SUB(Y<U1-T)           // Y3 = U1-X3
	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)

	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
	VOR  RL, RL, X0
	VOR  RH, RH, X1
	CALL sm2p256MulInternal<>(SB)
	VOR  T0, T0, U1L
	VOR  T1, T1, U1H

	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
	VOR  S1L, S1L, X0
	VOR  S1H, S1H, X1
	VOR  T2L, T2L, Y0
	VOR  T2H, T2H, Y1
	CALL sm2p256MulInternal<>(SB)

	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
	MOVD res+0(FP), P3ptr
	XXPERMDI T1, T1, $2, TT1
	XXPERMDI T0, T0, $2, TT0
	STXVD2X TT0, (R17)(P3ptr)
	STXVD2X TT1, (R18)(P3ptr)

	RET
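
// Note on the return value: ret is 1 exactly when both H and R reduce to
// zero mod P, i.e. when the two inputs represent the same point (or a
// degenerate zero-Z input), in which case the Jacobian sum written above
// is not the correct result. A caller is expected to check the flag and
// fall back to doubling, roughly:
//
//	if p256PointAddAsm(res, p1, p2) == 1 {
//		p256PointDoubleAsm(res, p1)
//	}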