github.com/emmansun/gmsm@v0.29.1/sm9/bn256/gfp_ppc64x.s

// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

#include "textflag.h"

// func gfpUnmarshal(out *gfP, in *[32]byte)
TEXT ·gfpUnmarshal(SB), NOSPLIT, $0-16
	MOVD res+0(FP), R3
	MOVD in+8(FP), R4
	BR   gfpInternalEndianSwap<>(SB)

// func gfpMarshal(out *[32]byte, in *gfP)
TEXT ·gfpMarshal(SB), NOSPLIT, $0-16
	MOVD res+0(FP), R3
	MOVD in+8(FP), R4
	BR   gfpInternalEndianSwap<>(SB)

TEXT gfpInternalEndianSwap<>(SB), NOSPLIT, $0-0
	// Index registers needed for the byte-reversed (MOVDBR) loads
#ifdef GOARCH_ppc64le
	MOVD $8, R9
	MOVD $16, R10
	MOVD $24, R14

	MOVDBR (R0)(R4), R5
	MOVDBR (R9)(R4), R6
	MOVDBR (R10)(R4), R7
	MOVDBR (R14)(R4), R8

	MOVD R8, 0(R3)
	MOVD R7, 8(R3)
	MOVD R6, 16(R3)
	MOVD R5, 24(R3)
#else
	MOVD $16, R10
	LXVD2X (R4)(R0), V0
	LXVD2X (R4)(R10), V1

	XXPERMDI V0, V0, $2, V0
	XXPERMDI V1, V1, $2, V1

	STXVD2X V1, (R0+R3)
	STXVD2X V0, (R10+R3)
#endif
	RET

#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define T1L V4
#define T1H V5
#define T0 V4
#define T1 V5
#define T2 V6
#define SEL1 V7
#define ZERO V8
#define CAR1 V9
#define CAR2 V10
#define TT0 V11
#define TT1 V12

#define PL V30
#define PH V31

#define gfpSubInternal(T1, T0, X1, X0, Y1, Y0) \
	VSPLTISB $0, ZERO           \ // VZERO
	VSUBCUQ  X0, Y0, CAR1       \
	VSUBUQM  X0, Y0, T0         \
	VSUBECUQ X1, Y1, CAR1, SEL1 \
	VSUBEUQM X1, Y1, CAR1, T1   \
	VSUBUQM  ZERO, SEL1, SEL1   \ // VSQ
	                            \
	VADDCUQ  T0, PL, CAR1       \ // VACCQ
	VADDUQM  T0, PL, TT0        \ // VAQ
	VADDEUQM T1, PH, CAR1, TT1  \ // VACQ
	                            \
	VSEL TT0, T0, SEL1, T0      \
	VSEL TT1, T1, SEL1, T1

TEXT ·gfpNeg(SB),0,$0-16
	MOVD c+0(FP), R3
	MOVD a+8(FP), R4

	MOVD $16, R5
	LXVD2X (R4)(R0), Y1L
	LXVD2X (R4)(R5), Y1H

	XXPERMDI Y1H, Y1H, $2, Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L

	MOVD $·p2+0(SB), R6
	LXVD2X (R6)(R0), PL
	LXVD2X (R6)(R5), PH

	XXPERMDI PH, PH, $2, PH
	XXPERMDI PL, PL, $2, PL

	VSPLTISB $0, X1L
	gfpSubInternal(T1, T0, X1L, X1L, Y1H, Y1L)

	XXPERMDI T1, T1, $2, T1
	XXPERMDI T0, T0, $2, T0

	STXVD2X T0, (R0+R3)
	STXVD2X T1, (R5+R3)
	RET

TEXT ·gfpSub(SB),0,$0-24
	MOVD c+0(FP), R3
	MOVD a+8(FP), R4
	MOVD b+16(FP), R5

	MOVD $16, R6
	LXVD2X (R4)(R0), X1L
	LXVD2X (R4)(R6), X1H
	XXPERMDI X1H, X1H, $2, X1H
	XXPERMDI X1L, X1L, $2, X1L

	LXVD2X (R5)(R0), Y1L
	LXVD2X (R5)(R6), Y1H
	XXPERMDI Y1H, Y1H, $2, Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L

	MOVD $·p2+0(SB), R7
	LXVD2X (R7)(R0), PL
	LXVD2X (R7)(R6), PH
	XXPERMDI PH, PH, $2, PH
	XXPERMDI PL, PL, $2, PL

	gfpSubInternal(T1, T0, X1H, X1L, Y1H, Y1L)

	XXPERMDI T1, T1, $2, T1
	XXPERMDI T0, T0, $2, T0

	STXVD2X T0, (R0+R3)
	STXVD2X T1, (R6+R3)
	RET

#define gfpAddInternal(T1, T0, X1, X0, Y1, Y0) \
	VADDCUQ  X0, Y0, CAR1         \
	VADDUQM  X0, Y0, T0           \
	VADDECUQ X1, Y1, CAR1, T2     \ // VACCCQ
	VADDEUQM X1, Y1, CAR1, T1     \
	                              \
	VSUBCUQ  T0, PL, CAR1         \ // VSCBIQ
	VSUBUQM  T0, PL, TT0          \
	VSUBECUQ T1, PH, CAR1, CAR2   \ // VSBCBIQ
	VSUBEUQM T1, PH, CAR1, TT1    \ // VSBIQ
	VSUBEUQM T2, ZERO, CAR2, SEL1 \
	                              \
	VSEL TT0, T0, SEL1, T0        \
	VSEL TT1, T1, SEL1, T1
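
// Both helper macros reduce their result with a constant-time VSEL rather
// than a branch: gfpSubInternal computes X - Y, adds p back, and uses the
// borrow mask to keep the corrected value; gfpAddInternal computes X + Y,
// subtracts p, and uses the borrow chain to keep either the raw or the
// reduced sum. The XXPERMDI $2 pairs around each load/store swap the two
// doublewords so the 128-bit quadword arithmetic sees the limbs in a
// consistent order. Reference semantics (an illustrative Go sketch, not
// part of this package; the function names are hypothetical):
//
//	func gfpAddRef(a, b, p *big.Int) *big.Int {
//		t := new(big.Int).Add(a, b)
//		if t.Cmp(p) >= 0 {
//			t.Sub(t, p) // done branch-free with VSEL in the assembly
//		}
//		return t
//	}
//
//	func gfpSubRef(a, b, p *big.Int) *big.Int {
//		t := new(big.Int).Sub(a, b)
//		if t.Sign() < 0 {
//			t.Add(t, p) // add the modulus back on borrow
//		}
//		return t
//	}
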
TEXT ·gfpAdd(SB),0,$0-24
	MOVD c+0(FP), R3
	MOVD a+8(FP), R4
	MOVD b+16(FP), R5

	MOVD $16, R6
	LXVD2X (R4)(R0), X1L
	LXVD2X (R4)(R6), X1H
	XXPERMDI X1H, X1H, $2, X1H
	XXPERMDI X1L, X1L, $2, X1L

	LXVD2X (R5)(R0), Y1L
	LXVD2X (R5)(R6), Y1H
	XXPERMDI Y1H, Y1H, $2, Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L

	MOVD $·p2+0(SB), R7
	LXVD2X (R7)(R0), PL
	LXVD2X (R7)(R6), PH
	XXPERMDI PH, PH, $2, PH
	XXPERMDI PL, PL, $2, PL

	VSPLTISB $0, ZERO

	gfpAddInternal(T1, T0, X1H, X1L, Y1H, Y1L)

	XXPERMDI T1, T1, $2, T1
	XXPERMDI T0, T0, $2, T0

	STXVD2X T0, (R0+R3)
	STXVD2X T1, (R6+R3)
	RET

TEXT ·gfpDouble(SB),0,$0-16
	MOVD c+0(FP), R3
	MOVD a+8(FP), R4

	MOVD $16, R6
	LXVD2X (R4)(R0), X1L
	LXVD2X (R4)(R6), X1H
	XXPERMDI X1H, X1H, $2, X1H
	XXPERMDI X1L, X1L, $2, X1L

	MOVD $·p2+0(SB), R7
	LXVD2X (R7)(R0), PL
	LXVD2X (R7)(R6), PH
	XXPERMDI PH, PH, $2, PH
	XXPERMDI PL, PL, $2, PL

	VSPLTISB $0, ZERO

	gfpAddInternal(T1, T0, X1H, X1L, X1H, X1L)

	XXPERMDI T1, T1, $2, T1
	XXPERMDI T0, T0, $2, T0

	STXVD2X T0, (R0+R3)
	STXVD2X T1, (R6+R3)
	RET

TEXT ·gfpTriple(SB),0,$0-16
	MOVD c+0(FP), R3
	MOVD a+8(FP), R4

	MOVD $16, R6
	LXVD2X (R4)(R0), X1L
	LXVD2X (R4)(R6), X1H
	XXPERMDI X1H, X1H, $2, X1H
	XXPERMDI X1L, X1L, $2, X1L

	MOVD $·p2+0(SB), R7
	LXVD2X (R7)(R0), PL
	LXVD2X (R7)(R6), PH
	XXPERMDI PH, PH, $2, PH
	XXPERMDI PL, PL, $2, PL

	VSPLTISB $0, ZERO

	gfpAddInternal(T1, T0, X1H, X1L, X1H, X1L)
	gfpAddInternal(T1, T0, T1, T0, X1H, X1L)

	XXPERMDI T1, T1, $2, T1
	XXPERMDI T0, T0, $2, T0

	STXVD2X T0, (R0+R3)
	STXVD2X T1, (R6+R3)
	RET

#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef T1L
#undef T1H
#undef T0
#undef T1
#undef T2
#undef SEL1
#undef ZERO
#undef CAR1
#undef CAR2
#undef TT0
#undef TT1
#undef PL
#undef PH

// Vector multiply word (the commented VMLF/VMLHF lines give the s390x
// equivalents this sequence emulates):
//
//	VMLF  x0, x1, out_low
//	VMLHF x0, x1, out_hi
#define VMULT(x1, x2, out_low, out_hi) \
	VMULEUW x1, x2, TMP1; \
	VMULOUW x1, x2, TMP2; \
	VMRGEW TMP1, TMP2, out_hi; \
	VMRGOW TMP1, TMP2, out_low

// Vector multiply add word (the commented VMALF/VMALHF lines give the
// s390x equivalents):
//
//	VMALF  x0, x1, y, out_low
//	VMALHF x0, x1, y, out_hi
#define VMULT_ADD(x1, x2, y, one, out_low, out_hi) \
	VMULEUW y, one, TMP1; \
	VMULOUW y, one, TMP2; \
	VMULEUW x1, x2, out_hi; \
	VMULOUW x1, x2, out_low; \
	VADDUDM TMP1, out_hi, TMP1; \
	VADDUDM TMP2, out_low, TMP2; \
	VMRGEW TMP1, TMP2, out_hi; \
	VMRGOW TMP1, TMP2, out_low
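
// gfpMulInternal computes the Montgomery product T1||T0 = X*Y*2^-256 mod p,
// consuming one 32-bit word of Y per step (eight steps in total). Each step
// multiplies X by the replicated word YDIG, derives the reduction multiplier
// MK0 = T*K0 mod 2^32 (K0 is loaded from ·np, the negated inverse of p
// modulo the word base), adds MK0*p so the low word cancels, and shifts the
// accumulator right by one word; the VSLDOI $12 pairs implement that
// one-word shift across the two 128-bit halves. Reference semantics (an
// illustrative Go sketch, not part of this package; montMulRef is
// hypothetical):
//
//	func montMulRef(x, y, p, np0 *big.Int) *big.Int {
//		base := new(big.Int).Lsh(big.NewInt(1), 32) // word base 2^32
//		t := new(big.Int)
//		for i := 0; i < 8; i++ {
//			yi := new(big.Int).Mod(new(big.Int).Rsh(y, uint(32*i)), base)
//			t.Add(t, new(big.Int).Mul(x, yi)) // T += X * Y[i]
//			m := new(big.Int).Mod(new(big.Int).Mul(new(big.Int).Mod(t, base), np0), base)
//			t.Add(t, new(big.Int).Mul(p, m)) // T += m*p; low word becomes zero
//			t.Rsh(t, 32)                     // exact one-word shift
//		}
//		if t.Cmp(p) >= 0 {
//			t.Sub(t, p) // final conditional subtraction
//		}
//		return t // x * y * 2^-256 mod p
//	}
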
// ---------------------------------------
// gfpMulInternal
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define M1 V4
#define M0 V5
#define T0 V6
#define T1 V7
#define T2 V8
#define YDIG V9

#define ADD1 V16
#define ADD1H V17
#define ADD2 V18
#define ADD2H V19
#define RED1 V20
#define RED1H V21
#define RED2 V22
#define RED2H V23
#define CAR1 V24
#define CAR1M V25

#define MK0 V30
#define K0 V31

// TMP1, TMP2 used in
// VMULT macros
#define TMP1 V13
#define TMP2 V27
#define ONE V29 // 1s splatted by word

TEXT gfpMulInternal<>(SB), NOSPLIT, $0
	// ---------------------------------------------------------------------------/
	// VREPF $3, Y0, YDIG
	VSPLTW $3, Y0, YDIG
	VSPLTISW $1, ONE

	// VMLF  X0, YDIG, ADD1
	// VMLF  X1, YDIG, ADD2
	// VMLHF X0, YDIG, ADD1H
	// VMLHF X1, YDIG, ADD2H
	VMULT(X0, YDIG, ADD1, ADD1H)
	VMULT(X1, YDIG, ADD2, ADD2H)

	// VMLF  ADD1, K0, MK0
	// VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	// VMALF  M0, MK0, ADD1, RED1
	// VMALHF M0, MK0, ADD1, RED1H
	// VMALF  M1, MK0, ADD2, RED2
	// VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSPLTISB $0, T2 // VZERO T2

	VSLDOI $12, RED2, RED1, RED1 // VSLDB
	VSLDOI $12, T2, RED2, RED2   // VSLDB

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM CAR1, T2, T2             // VAQ

	// ---------------------------------------------------
	/* *
	 * ---+--------+--------+
	 *  T2|   T1   |   T0   |
	 * ---+--------+--------+
	 *          *(add)*
	 *    +--------+--------+
	 *    |   X1   |   X0   |
	 *    +--------+--------+
	 *          *(mul)*
	 *    +--------+--------+
	 *    |  YDIG  |  YDIG  |
	 *    +--------+--------+
	 *          *(add)*
	 *    +--------+--------+
	 *    |   M1   |   M0   |
	 *    +--------+--------+
	 *          *(mul)*
	 *    +--------+--------+
	 *    |  MK0   |  MK0   |
	 *    +--------+--------+
	 *
	 *   ---------------------
	 *
	 *    +--------+--------+
	 *    |  ADD2  |  ADD1  |
	 *    +--------+--------+
	 *    +--------+--------+
	 *    |  ADD2H |  ADD1H |
	 *    +--------+--------+
	 *    +--------+--------+
	 *    |  RED2  |  RED1  |
	 *    +--------+--------+
	 *    +--------+--------+
	 *    |  RED2H |  RED1H |
	 *    +--------+--------+
	 */
	// VREPF $2, Y0, YDIG
	VSPLTW $2, Y0, YDIG

	// VMALF  X0, YDIG, T0, ADD1
	// VMALF  X1, YDIG, T1, ADD2
	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	// VMLF  ADD1, K0, MK0
	// VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	// VMALF  M0, MK0, ADD1, RED1
	// VMALHF M0, MK0, ADD1, RED1H
	// VMALF  M1, MK0, ADD2, RED2
	// VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1 // VSLDB
	VSLDOI $12, T2, RED2, RED2   // VSLDB

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM CAR1, T2, T2             // VAQ
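
	// The identical multiply/reduce/shift step repeats below for the six
	// remaining 32-bit words of Y: words 1 and 0 of Y0, then words 3
	// through 0 of Y1.
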
	// ---------------------------------------------------
	// VREPF $1, Y0, YDIG
	VSPLTW $1, Y0, YDIG

	// VMALF  X0, YDIG, T0, ADD1
	// VMALF  X1, YDIG, T1, ADD2
	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	// VMLF  ADD1, K0, MK0
	// VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	// VMALF  M0, MK0, ADD1, RED1
	// VMALHF M0, MK0, ADD1, RED1H
	// VMALF  M1, MK0, ADD2, RED2
	// VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1 // VSLDB
	VSLDOI $12, T2, RED2, RED2   // VSLDB

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM CAR1, T2, T2             // VAQ

	// ---------------------------------------------------
	// VREPF $0, Y0, YDIG
	VSPLTW $0, Y0, YDIG

	// VMALF  X0, YDIG, T0, ADD1
	// VMALF  X1, YDIG, T1, ADD2
	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	// VMLF  ADD1, K0, MK0
	// VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	// VMALF  M0, MK0, ADD1, RED1
	// VMALHF M0, MK0, ADD1, RED1H
	// VMALF  M1, MK0, ADD2, RED2
	// VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1
	VSLDOI $12, T2, RED2, RED2

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM CAR1, T2, T2             // VAQ

	// ---------------------------------------------------
	// VREPF $3, Y1, YDIG
	VSPLTW $3, Y1, YDIG

	// VMALF  X0, YDIG, T0, ADD1
	// VMALF  X1, YDIG, T1, ADD2
	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	// VMLF  ADD1, K0, MK0
	// VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	// VMALF  M0, MK0, ADD1, RED1
	// VMALHF M0, MK0, ADD1, RED1H
	// VMALF  M1, MK0, ADD2, RED2
	// VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1
	VSLDOI $12, T2, RED2, RED2

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM CAR1, T2, T2             // VAQ

	// ---------------------------------------------------
	// VREPF $2, Y1, YDIG
	VSPLTW $2, Y1, YDIG

	// VMALF  X0, YDIG, T0, ADD1
	// VMALF  X1, YDIG, T1, ADD2
	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	// VMLF  ADD1, K0, MK0
	// VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	// VMALF  M0, MK0, ADD1, RED1
	// VMALHF M0, MK0, ADD1, RED1H
	// VMALF  M1, MK0, ADD2, RED2
	// VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1
	VSLDOI $12, T2, RED2, RED2

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM CAR1, T2, T2             // VAQ

	// ---------------------------------------------------
	// VREPF $1, Y1, YDIG
	VSPLTW $1, Y1, YDIG

	// VMALF  X0, YDIG, T0, ADD1
	// VMALF  X1, YDIG, T1, ADD2
	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	// VMLF  ADD1, K0, MK0
	// VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	// VMALF  M0, MK0, ADD1, RED1
	// VMALHF M0, MK0, ADD1, RED1H
	// VMALF  M1, MK0, ADD2, RED2
	// VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1
	VSLDOI $12, T2, RED2, RED2

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM CAR1, T2, T2             // VAQ

	// ---------------------------------------------------
	// VREPF $0, Y1, YDIG
	VSPLTW $0, Y1, YDIG

	// VMALF  X0, YDIG, T0, ADD1
	// VMALF  X1, YDIG, T1, ADD2
	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	// VMLF  ADD1, K0, MK0
	// VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	// VMALF  M0, MK0, ADD1, RED1
	// VMALHF M0, MK0, ADD1, RED1H
	// VMALF  M1, MK0, ADD2, RED2
	// VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1
	VSLDOI $12, T2, RED2, RED2

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM CAR1, T2, T2             // VAQ

	// ---------------------------------------------------
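	// Final reduction: subtract the modulus once; the borrow accumulated
	// in T2 becomes the VSEL mask that picks, in constant time, between
	// the raw sum T1||T0 and the subtracted value ADD2||ADD1.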
	// VZERO   RED1
	// VSCBIQ  M0, T0, CAR1
	// VSQ     M0, T0, ADD1
	// VSBCBIQ T1, M1, CAR1, CAR1M
	// VSBIQ   T1, M1, CAR1, ADD2
	// VSBIQ   T2, RED1, CAR1M, T2
	VSPLTISB $0, RED1            // VZERO RED1
	VSUBCUQ  T0, M0, CAR1        // VSCBIQ
	VSUBUQM  T0, M0, ADD1        // VSQ
	VSUBECUQ T1, M1, CAR1, CAR1M // VSBCBIQ
	VSUBEUQM T1, M1, CAR1, ADD2  // VSBIQ
	VSUBEUQM T2, RED1, CAR1M, T2 // VSBIQ

	// what output to use, ADD2||ADD1 or T1||T0?
	VSEL ADD1, T0, T2, T0
	VSEL ADD2, T1, T2, T1
	RET

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef M0
#undef M1
#undef T0
#undef T1
#undef T2
#undef YDIG

#undef ADD1
#undef ADD1H
#undef ADD2
#undef ADD2H
#undef RED1
#undef RED1H
#undef RED2
#undef RED2H
#undef CAR1
#undef CAR1M

#undef MK0
#undef K0
#undef TMP1
#undef TMP2
#undef ONE
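
// gfpMul computes the Montgomery product c = a*b*2^-256 mod p. It loads
// the operands, the modulus from ·p2, and the Montgomery constant from
// ·np into the fixed registers that gfpMulInternal expects, calls it, and
// stores T1||T0 back in memory order.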
// func gfpMul(c, a, b *gfP)
#define res_ptr R3
#define x_ptr R4
#define y_ptr R5
#define CPOOL R7
#define N R8

#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define M0 V5
#define M1 V4
#define T0 V6
#define T1 V7
#define K0 V31

TEXT ·gfpMul(SB),NOSPLIT,$0
	MOVD c+0(FP), res_ptr
	MOVD a+8(FP), x_ptr
	MOVD b+16(FP), y_ptr

	MOVD $16, R16

	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

	LXVD2X (R0)(y_ptr), Y0
	LXVD2X (R16)(y_ptr), Y1

	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1

	MOVD $·p2+0(SB), CPOOL
	LXVD2X (CPOOL)(R0), M0
	LXVD2X (CPOOL)(R16), M1

	XXPERMDI M0, M0, $2, M0
	XXPERMDI M1, M1, $2, M1

	MOVD $·np+0(SB), CPOOL
	LXVD2X (CPOOL)(R0), K0
	VSPLTW $1, K0, K0

	CALL gfpMulInternal<>(SB)

	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X T0, (R0)(res_ptr)
	STXVD2X T1, (R16)(res_ptr)

	RET

// func gfpSqr(res, in *gfP, n int)
TEXT ·gfpSqr(SB),NOSPLIT,$0
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr
	MOVD n+16(FP), N
	MOVD $16, R16

	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

	MOVD $·p2+0(SB), CPOOL
	LXVD2X (CPOOL)(R0), M0
	LXVD2X (CPOOL)(R16), M1

	XXPERMDI M0, M0, $2, M0
	XXPERMDI M1, M1, $2, M1

	MOVD $·np+0(SB), CPOOL
	LXVD2X (CPOOL)(R0), K0
	VSPLTW $1, K0, K0

sqrLoop:
	// Squaring reuses the same value for both multiplicands

	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL gfpMulInternal<>(SB)

	ADD $-1, N
	CMP $0, N
	BEQ done

	VOR T0, T0, X0
	VOR T1, T1, X1
	BR sqrLoop

done:
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X T0, (R0)(res_ptr)
	STXVD2X T1, (R16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef CPOOL
#undef N
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef M0
#undef M1
#undef T0
#undef T1
#undef K0

/* ---------------------------------------*/
#define res_ptr R3
#define x_ptr R4
#define CPOOL R7

#define M0 V5
#define M1 V4
#define T0 V6
#define T1 V7
#define T2 V8

#define ADD1 V16
#define ADD1H V17
#define ADD2 V18
#define ADD2H V19
#define RED1 V20
#define RED1H V21
#define RED2 V22
#define RED2H V23
#define CAR1 V24
#define CAR1M V25

#define MK0 V30
#define K0 V31

// TMP1, TMP2 used in
// VMULT macros
#define TMP1 V13
#define TMP2 V27
#define ONE V29 // 1s splatted by word
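
// gfpFromMont converts out of the Montgomery domain, computing
// res = in*2^-256 mod p. It runs the reduce-and-shift half of the
// Montgomery step eight times under the CTR loop (equivalent to a
// Montgomery multiplication by 1) and finishes with the same constant-time
// conditional subtraction as gfpMulInternal. Reference semantics (an
// illustrative Go sketch; fromMontRef is hypothetical):
//
//	func fromMontRef(x, p, np0 *big.Int) *big.Int {
//		return montMulRef(x, big.NewInt(1), p, np0) // see sketch above
//	}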
// func gfpFromMont(res, in *gfP)
TEXT ·gfpFromMont(SB),NOSPLIT,$0
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr

	MOVD $16, R16

	LXVD2X (R0)(x_ptr), T0
	LXVD2X (R16)(x_ptr), T1

	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1

	MOVD $·p2+0(SB), CPOOL
	LXVD2X (CPOOL)(R0), M0
	LXVD2X (CPOOL)(R16), M1

	XXPERMDI M0, M0, $2, M0
	XXPERMDI M1, M1, $2, M1

	MOVD $·np+0(SB), CPOOL
	LXVD2X (CPOOL)(R0), K0
	VSPLTW $1, K0, K0

	// ---------------------------------------------------------------------------/
	VSPLTISW $1, ONE
	VSPLTISB $0, T2 // VZERO T2

	MOVD $8, R5
	MOVD R5, CTR

loop:
	VMULUWM T0, K0, MK0
	VSPLTW $3, MK0, MK0

	VMULT_ADD(M0, MK0, T0, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, T1, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1 // VSLDB
	VSLDOI $12, T2, RED2, RED2   // VSLDB

	VADDCUQ RED1H, RED1, CAR1M // VACCQ
	VADDUQM RED1H, RED1, T0    // VAQ

	// << ready for next MK0

	VADDECUQ RED2H, RED2, CAR1M, T2 // VACCCQ
	VADDEUQM RED2H, RED2, CAR1M, T1 // VACQ

	BDNZ loop

	// ---------------------------------------------------
	VSPLTISB $0, RED1            // VZERO RED1
	VSUBCUQ  T0, M0, CAR1        // VSCBIQ
	VSUBUQM  T0, M0, ADD1        // VSQ
	VSUBECUQ T1, M1, CAR1, CAR1M // VSBCBIQ
	VSUBEUQM T1, M1, CAR1, ADD2  // VSBIQ
	VSUBEUQM T2, RED1, CAR1M, T2 // VSBIQ

	// what output to use, ADD2||ADD1 or T1||T0?
	VSEL ADD1, T0, T2, T0
	VSEL ADD2, T1, T2, T1

	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X T0, (R0)(res_ptr)
	STXVD2X T1, (R16)(res_ptr)
	RET