// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// This is a port of the s390x asm implementation.
// to ppc64le.

// Some changes were needed due to differences in
// the Go opcodes and/or available instructions
// between s390x and ppc64le.

// 1. There were operand order differences in the
// VSUBUQM, VSUBCUQ, and VSEL instructions.

// 2. ppc64 does not have a multiply high and low
// like s390x, so those were implemented using
// macros to compute the equivalent values.

// 3. The LVX, STVX instructions on ppc64 require
// 16 byte alignment of the data. To avoid that
// requirement, data is loaded using LXVD2X and
// STXVD2X with VPERM to reorder bytes correctly.

// I have identified some areas where I believe
// changes would be needed to make this work for big
// endian; however additional changes beyond what I
// have noted are most likely needed to make it work.
// - The string used with VPERM to swap the byte order
//   for loads and stores.
// - The EXTRACT_HI and EXTRACT_LO strings.
// - The constants that are loaded from CPOOL.
//

// Permute string used by VPERM to reorder bytes
// loaded or stored using LXVD2X or STXVD2X
// on little endian. It swaps the two 8-byte
// doublewords and reverses the bytes within each,
// giving a full 16-byte byte reversal.
DATA byteswap<>+0(SB)/8, $0x08090a0b0c0d0e0f
DATA byteswap<>+8(SB)/8, $0x0001020304050607

// The following constants are defined in an order
// that is correct for use with LXVD2X/STXVD2X
// on little endian.
// Constant pool for the generic reduction path (p256FromMont):
// the P-256 prime followed by VPERM byte-selection strings.
DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0 d1 d0 0
DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0 d1 d0 0
DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0

// Constant pool for the Montgomery multiply/negate paths:
// P256 (low half first), selection strings, and 2^256 mod P256.
DATA p256mul<>+0x00(SB)/8, $0x00000000ffffffff // P256 original
DATA p256mul<>+0x08(SB)/8, $0xffffffffffffffff // P256
DATA p256mul<>+0x10(SB)/8, $0xffffffff00000001 // P256 original
DATA p256mul<>+0x18(SB)/8, $0x0000000000000000 // P256
DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0  0  0 d0
DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0  0  0 d0
DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0  0 d1 d0
DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0  0 d1 d0
DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL  0 d1 d0 d1
DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL  0 d1 d0 d1
DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL  0  0 d1 d0
DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL  0  0 d1 d0
DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL  0 d1 d0  0
DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL  0 d1 d0  0
DATA p256mul<>+0x80(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
DATA p256mul<>+0x88(SB)/8, $0x0000000000000001 // (1*2^256)%P256
DATA p256mul<>+0x90(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
DATA p256mul<>+0x98(SB)/8, $0xffffffffffffffff // (1*2^256)%P256

// The following are used with VPERM to extract the high and low
// values from the intermediate results of a vector multiply.
// They are used in the VMULTxxx macros. These have been tested
// only on little endian, I think they would have to be different
// for big endian.
DATA p256permhilo<>+0x00(SB)/8, $0x0405060714151617 // least significant
DATA p256permhilo<>+0x08(SB)/8, $0x0c0d0e0f1c1d1e1f
DATA p256permhilo<>+0x10(SB)/8, $0x0001020310111213 // most significant
DATA p256permhilo<>+0x18(SB)/8, $0x08090a0b18191A1B

// External declarations for constants.
// The literal flag value 8 equals RODATA (spelled out for
// byteswap below).
// NOTE(review): p256ord<> has no DATA directives in this part of
// the file — presumably initialized or only referenced elsewhere;
// confirm before relying on its contents.
GLOBL p256ord<>(SB), 8, $32
GLOBL p256<>(SB), 8, $80
GLOBL p256mul<>(SB), 8, $160
GLOBL p256permhilo<>(SB), 8, $32
GLOBL byteswap<>+0(SB), RODATA, $16

// The following macros are used to implement the ppc64le
// equivalent function from the corresponding s390x
// instruction for vector multiply high, low, and add,
// since there aren't exact equivalent instructions.
// The corresponding s390x instructions appear in the
// comments.
// Implementation for big endian would have to be
// investigated, I think it would be different.
//
// Vector multiply low word
//
//	VMLF	x0, x1, out_low
#define VMULT_LOW(x1, x2, out_low) \
	VMULUWM x1, x2, out_low

//
// Vector multiply high word
//
//	VMLHF	x0, x1, out_hi
#define VMULT_HI(x1, x2, out_hi) \
	VMULEUW x1, x2, TMP1; \
	VMULOUW x1, x2, TMP2; \
	VPERM   TMP1, TMP2, EXTRACT_HI, out_hi

//
// Vector multiply word
//
//	VMLF	x0, x1, out_low
//	VMLHF	x0, x1, out_hi
#define VMULT(x1, x2, out_low, out_hi) \
	VMULEUW x1, x2, TMP1; \
	VMULOUW x1, x2, TMP2; \
	VPERM   TMP1, TMP2, EXTRACT_LO, out_low; \
	VPERM   TMP1, TMP2, EXTRACT_HI, out_hi

//
// Vector multiply add word
//
//	VMALF	x0, x1, y, out_low
//	VMALHF	x0, x1, y, out_hi
#define VMULT_ADD(x1, x2, y, out_low, out_hi) \
	VSPLTISW $1, TMP1; \
	VMULEUW  y, TMP1, TMP2; \
	VMULOUW  y, TMP1, TMP1; \
	VMULEUW  x1, x2, out_low; \
	VMULOUW  x1, x2, out_hi; \
	VADDUDM  TMP1, out_hi, TMP1; \
	VADDUDM  TMP2, out_low, TMP2; \
	VPERM    TMP2, TMP1, EXTRACT_LO, out_low; \
	VPERM    TMP2, TMP1, EXTRACT_HI, out_hi

//
// Vector multiply add high word
//
//	VMALF	x0, x1, y, out_low
//	VMALHF	x0, x1, y, out_hi
#define VMULT_ADD_HI(x1, x2, y, out_low, out_hi) \
	VSPLTISW $1, TMP1; \
	VMULOUW  y, TMP1, TMP2; \
	VMULEUW  y, TMP1, TMP1; \
	VMULEUW  x1, x2, out_hi; \
	VMULOUW  x1, x2, out_low; \
	VADDUDM  TMP1, out_hi, TMP1; \
	VADDUDM  TMP2, out_low, TMP2; \
	VPERM    TMP2, TMP1, EXTRACT_HI, out_hi

//
// Vector multiply add low word
//
//	VMALF	s0, x1, y, out_low
#define VMULT_ADD_LOW(x1, x2, y, out_low) \
	VMULUWM x1, x2, out_low; \
	VADDUWM out_low, y, out_low

// NOTE(review): defined and immediately undefined — leftover
// scaffolding from the port; kept for fidelity.
#define res_ptr R3
#define a_ptr R4

#undef res_ptr
#undef a_ptr

// func p256NegCond(val *p256Point, cond int)
//
// If cond != 0, the 32 bytes at val+32..val+63 (loaded below via
// offsets R17=32 and R18=48 — the point's y coordinate) are
// replaced by P256 minus that value. If cond == 0 the function
// returns without touching memory.
#define P1ptr R3
#define CPOOL R7

#define Y1L V0
#define Y1L_ VS32
#define Y1H V1
#define Y1H_ VS33
#define T1L V2
#define T1L_ VS34
#define T1H V3
#define T1H_ VS35

#define SWAP V28
#define SWAP_ VS60

#define PL V30
#define PL_ VS62
#define PH V31
#define PH_ VS63

// NOTE(review): SEL1/SEL1_ are defined but never used in this
// function.
#define SEL1 V5
#define SEL1_ VS37
#define CAR1 V6
//
// iff cond == 1 val <- -val
//
TEXT ·p256NegCond(SB), NOSPLIT, $0-16
	MOVD val+0(FP), P1ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $40, R19 // NOTE(review): R19 is never used below

	MOVD cond+8(FP), R6
	CMP  $0, R6
	BC   12, 2, LR // just return if cond == 0

	MOVD $p256mul<>+0x00(SB), CPOOL

	// Permute string that byte-swaps the LXVD2X/STXVD2X lanes.
	MOVD   $byteswap<>+0x00(SB), R8
	LXVD2X (R8)(R0), SWAP_

	LXVD2X (P1ptr)(R17), Y1L_
	LXVD2X (P1ptr)(R18), Y1H_

	VPERM Y1H, Y1H, SWAP, Y1H
	VPERM Y1L, Y1L, SWAP, Y1L

	// PL/PH = P256 from the p256mul constant pool.
	LXVD2X (CPOOL)(R0), PL_
	LXVD2X (CPOOL)(R16), PH_

	VSUBCUQ  PL, Y1L, CAR1      // subtract part2 giving carry
	VSUBUQM  PL, Y1L, T1L       // subtract part2 giving result
	VSUBEUQM PH, Y1H, CAR1, T1H // subtract part1 using carry from part2

	VPERM T1H, T1H, SWAP, T1H
	VPERM T1L, T1L, SWAP, T1L

	STXVD2X T1L_, (R17+P1ptr)
	STXVD2X T1H_, (R18+P1ptr)
	RET

#undef P1ptr
#undef CPOOL
#undef Y1L
#undef Y1L_
#undef Y1H
#undef Y1H_
#undef T1L
#undef T1L_
#undef T1H
#undef T1H_
#undef PL
#undef PL_
#undef PH
#undef PH_
#undef SEL1
#undef SEL1_
#undef CAR1

// NOTE(review): SWAP/SWAP_ are deliberately not undefined here;
// they stay in scope until after p256SelectBase below.

//
// if cond == 0 res <- b; else res <- a
//
// func p256MovCond(res, a, b *p256Point, cond int)
#define P3ptr R3
#define P1ptr R4
#define P2ptr R5

#define FROMptr R7
#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define Z1L V4
#define Z1H V5
#define X1L_ VS32
#define X1H_ VS33
#define Y1L_ VS34
#define Y1H_ VS35
#define Z1L_ VS36
#define Z1H_ VS37

// This function uses LXVD2X and STXVD2X to avoid the
// data alignment requirement for LVX, STVX. Since
// this code is just moving bytes and not doing arithmetic,
// order of the bytes doesn't matter.
//
// NOTE(review): this implementation branches on cond (BEQ below).
// If cond is ever derived from secret data (as in scalar
// multiplication ladders) a branchless select would be preferable
// — confirm against the callers.
//
TEXT ·p256MovCond(SB), NOSPLIT, $0-32
	MOVD res+0(FP), P3ptr
	MOVD a+8(FP), P1ptr
	MOVD b+16(FP), P2ptr
	MOVD cond+24(FP), R6
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $56, R21 // NOTE(review): R21 is never used in this function
	MOVD $64, R19
	MOVD $80, R20

	// Check the condition
	CMP $0, R6

	// If 0, use b as the source
	BEQ FROMB

	// Not 0, use a as the source
	MOVD P1ptr, FROMptr
	BR   LOADVALS

FROMB:
	MOVD P2ptr, FROMptr

LOADVALS:
	// Load the 96-byte point from a or b depending on the
	// setting of FROMptr
	LXVW4X (FROMptr+R0), X1H_
	LXVW4X (FROMptr+R16), X1L_
	LXVW4X (FROMptr+R17), Y1H_
	LXVW4X (FROMptr+R18), Y1L_
	LXVW4X (FROMptr+R19), Z1H_
	LXVW4X (FROMptr+R20), Z1L_

	STXVW4X X1H_, (P3ptr+R0)
	STXVW4X X1L_, (P3ptr+R16)
	STXVW4X Y1H_, (P3ptr+R17)
	STXVW4X Y1L_, (P3ptr+R18)
	STXVW4X Z1H_, (P3ptr+R19)
	STXVW4X Z1L_, (P3ptr+R20)

	RET

#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef FROMptr
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X1L_
#undef X1H_
#undef Y1L_
#undef Y1H_
#undef Z1L_
#undef Z1H_

//
// Select the point from the table for idx
//
// func p256Select(point *p256Point, table []p256Point, idx int)
//
// Constant-time table lookup: every entry is read; VSEL with an
// all-ones/all-zeros mask keeps the entry whose 1-based position
// equals idx. idx == 0 (no match) leaves the zeroed accumulator.
#define P3ptr R3
#define P1ptr R4
#define COUNT R5

#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define Z1L V4
#define Z1H V5
#define X1L_ VS32
#define X1H_ VS33
#define Y1L_ VS34
#define Y1H_ VS35
#define Z1L_ VS36
#define Z1H_ VS37
#define X2L V6
#define X2H V7
#define Y2L V8
#define Y2H V9
#define Z2L V10
#define Z2H V11
#define X2L_ VS38
#define X2H_ VS39
#define Y2L_ VS40
#define Y2H_ VS41
#define Z2L_ VS42
#define Z2H_ VS43

#define ONE V18
#define IDX V19
#define SEL1 V20
#define SEL1_ VS52
#define SEL2 V21
//
TEXT ·p256Select(SB), NOSPLIT, $0-40
	MOVD point+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	LXVDSX   (R1)(R19), SEL1_ // VLREPG idx+32(FP), SEL1
	VSPLTB   $7, SEL1, IDX    // splat byte
	VSPLTISB $1, ONE          // VREPIB $1, ONE
	VSPLTISB $1, SEL2         // VREPIB $1, SEL2

	// NOTE(review): 17 iterations scan 17*96 bytes; confirm the
	// table really has 17 entries (the s390x original loops 16).
	// SEL2 == 17 can never match a valid idx, so the final
	// iteration only risks an out-of-bounds read, not a wrong
	// result.
	MOVD $17, COUNT
	MOVD COUNT, CTR // set up ctr

	VSPLTISB $0, X1H // VZERO X1H
	VSPLTISB $0, X1L // VZERO X1L
	VSPLTISB $0, Y1H // VZERO Y1H
	VSPLTISB $0, Y1L // VZERO Y1L
	VSPLTISB $0, Z1H // VZERO Z1H
	VSPLTISB $0, Z1L // VZERO Z1L

loop_select:

	// LVXD2X is used here since data alignment doesn't
	// matter.

	LXVD2X (P1ptr+R0), X2H_
	LXVD2X (P1ptr+R16), X2L_
	LXVD2X (P1ptr+R17), Y2H_
	LXVD2X (P1ptr+R18), Y2L_
	LXVD2X (P1ptr+R19), Z2H_
	LXVD2X (P1ptr+R20), Z2L_

	VCMPEQUD SEL2, IDX, SEL1 // VCEQG SEL2, IDX, SEL1 OK

	// This will result in SEL1 being all 0s or 1s, meaning
	// the result is either X1L or X2L, no individual byte
	// selection.

	VSEL X1L, X2L, SEL1, X1L
	VSEL X1H, X2H, SEL1, X1H
	VSEL Y1L, Y2L, SEL1, Y1L
	VSEL Y1H, Y2H, SEL1, Y1H
	VSEL Z1L, Z2L, SEL1, Z1L
	VSEL Z1H, Z2H, SEL1, Z1H

	// Add 1 to all bytes in SEL2
	VADDUBM SEL2, ONE, SEL2 // VAB SEL2, ONE, SEL2 OK
	ADD     $96, P1ptr
	BC      16, 0, loop_select

	// STXVD2X is used here so that alignment doesn't
	// need to be verified. Since values were loaded
	// using LXVD2X this is OK.
	STXVD2X X1H_, (P3ptr+R0)
	STXVD2X X1L_, (P3ptr+R16)
	STXVD2X Y1H_, (P3ptr+R17)
	STXVD2X Y1L_, (P3ptr+R18)
	STXVD2X Z1H_, (P3ptr+R19)
	STXVD2X Z1L_, (P3ptr+R20)
	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef X2L_
#undef X2H_
#undef Y2L_
#undef Y2H_
#undef Z2L_
#undef Z2H_
#undef ONE
#undef IDX
#undef SEL1
#undef SEL1_
#undef SEL2

// func p256SelectBase(point, table []uint64, idx int)
//
// Same constant-time selection as p256Select but over the base
// point precomputation table. The X1L_/X1H_/... VSR aliases from
// p256Select above are intentionally still in scope and are used
// for the stores at the end.
#define P3ptr R3
#define P1ptr R4
#define COUNT R5

#define X1L V0
#define X1H V1
#define Y1L V2
#define Y1H V3
#define Z1L V4
#define Z1H V5
#define X2L V6
#define X2H V7
#define Y2L V8
#define Y2H V9
#define Z2L V10
#define Z2H V11
#define X2L_ VS38
#define X2H_ VS39
#define Y2L_ VS40
#define Y2H_ VS41
#define Z2L_ VS42
#define Z2H_ VS43

#define ONE V18
#define IDX V19
#define SEL1 V20
#define SEL1_ VS52
#define SEL2 V21
TEXT ·p256SelectBase(SB), NOSPLIT, $0-40
	MOVD point+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20
	MOVD $56, R21 // NOTE(review): R21 is never used in this function

	LXVDSX (R1)(R19), SEL1_
	VSPLTB $7, SEL1, IDX // splat byte

	VSPLTISB $1, ONE  // Vector with byte 1s
	VSPLTISB $1, SEL2 // Vector with byte 1s

	// NOTE(review): 65 iterations scan 65*96 bytes; confirm the
	// per-window table really has 65 entries (idx values fit in
	// 1..64), otherwise the last iteration reads past the window.
	MOVD $65, COUNT
	MOVD COUNT, CTR // loop count

	VSPLTISB $0, X1H // VZERO X1H
	VSPLTISB $0, X1L // VZERO X1L
	VSPLTISB $0, Y1H // VZERO Y1H
	VSPLTISB $0, Y1L // VZERO Y1L
	VSPLTISB $0, Z1H // VZERO Z1H
	VSPLTISB $0, Z1L // VZERO Z1L

loop_select:
	LXVD2X (P1ptr+R0), X2H_
	LXVD2X (P1ptr+R16), X2L_
	LXVD2X (P1ptr+R17), Y2H_
	LXVD2X (P1ptr+R18), Y2L_
	LXVD2X (P1ptr+R19), Z2H_
	LXVD2X (P1ptr+R20), Z2L_

	VCMPEQUD SEL2, IDX, SEL1 // Compare against idx

	VSEL X1L, X2L, SEL1, X1L // Select if idx matched
	VSEL X1H, X2H, SEL1, X1H
	VSEL Y1L, Y2L, SEL1, Y1L
	VSEL Y1H, Y2H, SEL1, Y1H
	VSEL Z1L, Z2L, SEL1, Z1L
	VSEL Z1H, Z2H, SEL1, Z1H

	VADDUBM SEL2, ONE, SEL2 // Increment SEL2 bytes by 1
	ADD     $96, P1ptr      // Next chunk
	BC      16, 0, loop_select

	STXVD2X X1H_, (P3ptr+R0)
	STXVD2X X1L_, (P3ptr+R16)
	STXVD2X Y1H_, (P3ptr+R17)
	STXVD2X Y1L_, (P3ptr+R18)
	STXVD2X Z1H_, (P3ptr+R19)
	STXVD2X Z1L_, (P3ptr+R20)
	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef X1L_
#undef X1H_
#undef X2L_
#undef X2H_
#undef Y1L_
#undef Y1H_
#undef Y2L_
#undef Y2H_
#undef Z1L_
#undef Z1H_
#undef Z2L_
#undef Z2H_
#undef ONE
#undef IDX
#undef SEL1
#undef SEL1_
#undef SEL2
#undef SWAP
#undef SWAP_

// ---------------------------------------
// func p256FromMont(res, in []byte)
//
// Converts in (in Montgomery form) back to the plain
// representation: four reduction rounds built only from permutes,
// shifts and adds (P256's special form makes the usual multiply
// unnecessary), followed by one conditional subtraction of P256.
#define res_ptr R3
#define x_ptr R4
#define CPOOL R7

#define T0 V0
#define T0_ VS32
#define T1 V1
#define T1_ VS33
#define T2 V2
#define TT0 V3
#define TT1 V4
#define TT0_ VS35
#define TT1_ VS36

#define ZER V6
#define SEL1 V7
#define SEL1_ VS39
#define SEL2 V8
#define SEL2_ VS40
#define CAR1 V9
#define CAR2 V10
#define RED1 V11
#define RED2 V12
#define PL V13
#define PL_ VS45
#define PH V14
#define PH_ VS46
#define SWAP V28

// FIX: was VS57, which aliases V25, not V28 (VSR[32+i] overlays
// VR[i], so V28 == VS60 — the pairing used for SWAP/SWAP_ in
// p256NegCond above). The byteswap string is loaded through SWAP_
// and read back through SWAP by the VPERMs below, so the two
// names must alias the same register; with VS57 the VPERMs would
// permute with whatever happened to be left in V28.
#define SWAP_ VS60

TEXT ·p256FromMont(SB), NOSPLIT, $0-48
	MOVD res+0(FP), res_ptr
	MOVD in+24(FP), x_ptr

	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $p256<>+0x00(SB), CPOOL
	MOVD $byteswap<>+0x00(SB), R15

	VSPLTISB $0, T2  // VZERO T2
	VSPLTISB $0, ZER // VZERO ZER

	// Constants are defined so that the LXVD2X is correct
	LXVD2X (CPOOL+R0), PH_
	LXVD2X (CPOOL+R16), PL_

	// VPERM byte selections
	LXVD2X (CPOOL+R18), SEL2_
	LXVD2X (CPOOL+R19), SEL1_

	LXVD2X (R15)(R0), SWAP_

	LXVD2X (R16)(x_ptr), T1_
	LXVD2X (R0)(x_ptr), T0_

	// Put in true little endian order
	VPERM T0, T0, SWAP, T0
	VPERM T1, T1, SWAP, T1

	// First round
	VPERM   T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM   ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2      // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// Second round
	VPERM   T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM   ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2      // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// Third round
	VPERM   T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM   ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2      // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// Last round
	VPERM   T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM   ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2      // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// ---------------------------------------------------

	// Conditional subtraction of P256: compute T - P and keep it
	// iff the borrow chain (in T2) says T >= P.
	VSUBCUQ  T0, PL, CAR1       // VSCBIQ  PL, T0, CAR1
	VSUBUQM  T0, PL, TT0        // VSQ     PL, T0, TT0
	VSUBECUQ T1, PH, CAR1, CAR2 // VSBCBIQ T1, PH, CAR1, CAR2
	VSUBEUQM T1, PH, CAR1, TT1  // VSBIQ   T1, PH, CAR1, TT1
	VSUBEUQM T2, ZER, CAR2, T2  // VSBIQ   T2, ZER, CAR2, T2

	VSEL TT0, T0, T2, T0
	VSEL TT1, T1, T2, T1

	// Reorder the bytes so STXVD2X can be used.
	// TT0, TT1 used for VPERM result in case
	// the caller expects T0, T1 to be good.
	VPERM T0, T0, SWAP, TT0
	VPERM T1, T1, SWAP, TT1

	STXVD2X TT0_, (R0)(res_ptr)
	STXVD2X TT1_, (R16)(res_ptr)
	RET

// NOTE(review): TT0_/TT1_ are not undefined here (only TT0/TT1
// are); harmless, but confirm it is intentional.
#undef res_ptr
#undef x_ptr
#undef CPOOL
#undef T0
#undef T0_
#undef T1
#undef T1_
#undef T2
#undef TT0
#undef TT1
#undef ZER
#undef SEL1
#undef SEL1_
#undef SEL2
#undef SEL2_
#undef CAR1
#undef CAR2
#undef RED1
#undef RED2
#undef PL
#undef PL_
#undef PH
#undef PH_
#undef SWAP
#undef SWAP_

// ---------------------------------------
// p256MulInternal
// V0-V3 V30,V31 - Not Modified
// V4-V15 V27-V29 - Volatile

#define CPOOL R7

// Parameters
#define X0 V0 // Not modified
#define X1 V1 // Not modified
#define Y0 V2 // Not modified
#define Y1 V3 // Not modified
#define T0 V4 // Result
#define T1 V5 // Result
#define P0 V30 // Not modified
#define P1 V31 // Not modified

// Temporaries: lots of reused vector regs
#define YDIG V6 // Overloaded with CAR2
#define ADD1H V7 // Overloaded with ADD3H
#define ADD2H V8 // Overloaded with ADD4H
#define ADD3 V9 // Overloaded with SEL2,SEL5
#define ADD4 V10 // Overloaded with SEL3,SEL6
#define RED1 V11 // Overloaded with CAR2
#define RED2 V12
#define RED3 V13 // Overloaded with SEL1
#define T2 V14
// Overloaded temporaries
#define ADD1 V4 // Overloaded with T0
#define ADD2 V5 // Overloaded with T1
#define ADD3H V7 // Overloaded with ADD1H
#define ADD4H V8 // Overloaded with ADD2H
#define ZER V28 // Overloaded with TMP1
#define CAR1 V6 // Overloaded with YDIG
#define CAR2 V11 // Overloaded with RED1
// Constant Selects
#define SEL1 V13 // Overloaded with RED3
#define SEL2 V9 // Overloaded with ADD3,SEL5
#define SEL3 V10 // Overloaded with ADD4,SEL6
#define SEL4 V6 // Overloaded with YDIG,CAR1
#define SEL5 V9 // Overloaded with ADD3,SEL2
ADD3,SEL2 796 #define SEL6 V10 // Overloaded with ADD4,SEL3 797 #define SEL1_ VS45 798 #define SEL2_ VS41 799 #define SEL3_ VS42 800 #define SEL4_ VS38 801 #define SEL5_ VS41 802 #define SEL6_ VS42 803 804 // TMP1, TMP2, EXTRACT_LO, EXTRACT_HI used in 805 // VMULT macros 806 #define TMP1 V13 // Overloaded with RED3 807 #define TMP2 V27 808 #define EVENODD R5 809 #define EXTRACT_LO V28 810 #define EXTRACT_LO_ VS60 811 #define EXTRACT_HI V29 812 #define EXTRACT_HI_ VS61 813 814 /* * 815 * To follow the flow of bits, for your own sanity a stiff drink, need you shall. 816 * Of a single round, a 'helpful' picture, here is. Meaning, column position has. 817 * With you, SIMD be... 818 * 819 * +--------+--------+ 820 * +--------| RED2 | RED1 | 821 * | +--------+--------+ 822 * | ---+--------+--------+ 823 * | +---- T2| T1 | T0 |--+ 824 * | | ---+--------+--------+ | 825 * | | | 826 * | | ======================= | 827 * | | | 828 * | | +--------+--------+<-+ 829 * | +-------| ADD2 | ADD1 |--|-----+ 830 * | | +--------+--------+ | | 831 * | | +--------+--------+<---+ | 832 * | | | ADD2H | ADD1H |--+ | 833 * | | +--------+--------+ | | 834 * | | +--------+--------+<-+ | 835 * | | | ADD4 | ADD3 |--|-+ | 836 * | | +--------+--------+ | | | 837 * | | +--------+--------+<---+ | | 838 * | | | ADD4H | ADD3H |------|-+ |(+vzero) 839 * | | +--------+--------+ | | V 840 * | | ------------------------ | | +--------+ 841 * | | | | | RED3 | [d0 0 0 d0] 842 * | | | | +--------+ 843 * | +---->+--------+--------+ | | | 844 * (T2[1w]||ADD2[4w]||ADD1[3w]) +--------| T1 | T0 | | | | 845 * | +--------+--------+ | | | 846 * +---->---+--------+--------+ | | | 847 * T2| T1 | T0 |----+ | | 848 * ---+--------+--------+ | | | 849 * ---+--------+--------+<---+ | | 850 * +--- T2| T1 | T0 |----------+ 851 * | ---+--------+--------+ | | 852 * | +--------+--------+<-------------+ 853 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0] 854 * | +--------+--------+ | | | 855 * | 
+--------+<----------------------+ 856 * | | RED3 |--------------+ | [0 0 d1 d0] 857 * | +--------+ | | 858 * +--->+--------+--------+ | | 859 * | T1 | T0 |--------+ 860 * +--------+--------+ | | 861 * --------------------------- | | 862 * | | 863 * +--------+--------+<----+ | 864 * | RED2 | RED1 | | 865 * +--------+--------+ | 866 * ---+--------+--------+<-------+ 867 * T2| T1 | T0 | (H1P-H1P-H00RRAY!) 868 * ---+--------+--------+ 869 * 870 * *Mi obra de arte de siglo XXI @vpaprots 871 * 872 * 873 * First group is special, doesn't get the two inputs: 874 * +--------+--------+<-+ 875 * +-------| ADD2 | ADD1 |--|-----+ 876 * | +--------+--------+ | | 877 * | +--------+--------+<---+ | 878 * | | ADD2H | ADD1H |--+ | 879 * | +--------+--------+ | | 880 * | +--------+--------+<-+ | 881 * | | ADD4 | ADD3 |--|-+ | 882 * | +--------+--------+ | | | 883 * | +--------+--------+<---+ | | 884 * | | ADD4H | ADD3H |------|-+ |(+vzero) 885 * | +--------+--------+ | | V 886 * | ------------------------ | | +--------+ 887 * | | | | RED3 | [d0 0 0 d0] 888 * | | | +--------+ 889 * +---->+--------+--------+ | | | 890 * (T2[1w]||ADD2[4w]||ADD1[3w]) | T1 | T0 |----+ | | 891 * +--------+--------+ | | | 892 * ---+--------+--------+<---+ | | 893 * +--- T2| T1 | T0 |----------+ 894 * | ---+--------+--------+ | | 895 * | +--------+--------+<-------------+ 896 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0] 897 * | +--------+--------+ | | | 898 * | +--------+<----------------------+ 899 * | | RED3 |--------------+ | [0 0 d1 d0] 900 * | +--------+ | | 901 * +--->+--------+--------+ | | 902 * | T1 | T0 |--------+ 903 * +--------+--------+ | | 904 * --------------------------- | | 905 * | | 906 * +--------+--------+<----+ | 907 * | RED2 | RED1 | | 908 * +--------+--------+ | 909 * ---+--------+--------+<-------+ 910 * T2| T1 | T0 | (H1P-H1P-H00RRAY!) 
 * ---+--------+--------+
 *
 * Last 'group' needs to RED2||RED1 shifted less
 */
// ---------------------------------------------------------------------
// p256MulInternal<>: multiplication modulo P256 with the reduction
// interleaved into the schoolbook digit loop (used as the Montgomery
// multiply core by p256MulAsm below).
//
// In:  X1||X0 = first 256-bit operand (two 128-bit vector halves)
//      Y1||Y0 = second 256-bit operand
//      CPOOL (R7) -> p256mul<> constant pool (loaded by the caller)
// Out: T1||T0 = reduced product
//
// Each "group" consumes two 32-bit digits of Y (selected with VSPLTW),
// accumulates the partial products with 128-bit carry chains
// (VADDCUQ/VADDECUQ/VADDEUQM) and folds in a partial reduction built
// from the SELx permute masks in the constant pool.
TEXT p256MulInternal<>(SB), NOSPLIT, $0-16
	// CPOOL loaded from caller
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20
	MOVD $96, R21
	MOVD $112, R22

	MOVD $p256permhilo<>+0x00(SB), EVENODD

	// These values are used by the VMULTxxx macros to
	// extract the high and low portions of the intermediate
	// result.
	LXVD2X (R0)(EVENODD), EXTRACT_LO_
	LXVD2X (R16)(EVENODD), EXTRACT_HI_

	// ---------------------------------------------------
	// First group (Y0 words 3,2) — special: no incoming partial sum.

	VSPLTW $3, Y0, YDIG // VREPF Y0 is input

	// VMLHF X0, YDIG, ADD1H
	// VMLHF X1, YDIG, ADD2H
	// VMLF  X0, YDIG, ADD1
	// VMLF  X1, YDIG, ADD2
	//
	VMULT(X0, YDIG, ADD1, ADD1H)
	VMULT(X1, YDIG, ADD2, ADD2H)

	VSPLTW $2, Y0, YDIG // VREPF

	// VMALF  X0, YDIG, ADD1H, ADD3
	// VMALF  X1, YDIG, ADD2H, ADD4
	// VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	// VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
	VMULT_ADD(X0, YDIG, ADD1H, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ADD4, ADD4H)

	LXVD2X   (R17)(CPOOL), SEL1_
	VSPLTISB $0, ZER               // VZERO ZER
	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free // VSLDB
	VSLDOI $12, ZER, ADD2, T1  // ADD2 Free // VSLDB

	VADDCUQ  T0, ADD3, CAR1     // VACCQ
	VADDUQM  T0, ADD3, T0       // ADD3 Free // VAQ
	VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1 // ADD4 Free // VACQ

	LXVD2X  (R18)(CPOOL), SEL2_
	LXVD2X  (R19)(CPOOL), SEL3_
	LXVD2X  (R20)(CPOOL), SEL4_
	VPERM   RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
	VPERM   RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM   RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
	VSUBUQM RED2, RED3, RED2     // Guaranteed not to underflow -->? // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
	VADDUQM  T0, ADD3H, T0       // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	// ---------------------------------------------------
	// Second group (Y0 words 1,0) — accumulates into T0/T1 and folds
	// in RED1/RED2 from the previous group's reduction.

	VSPLTW $1, Y0, YDIG // VREPF
	LXVD2X (R0)(EVENODD), EXTRACT_LO_
	LXVD2X (R16)(EVENODD), EXTRACT_HI_

	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	// VMALF  X0, YDIG, T0, ADD1  // T0 Free->ADD1
	// VMALF  X1, YDIG, T1, ADD2  // T1 Free->ADD2
	VMULT_ADD(X0, YDIG, T0, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ADD2, ADD2H)

	VSPLTW $0, Y0, YDIG // VREPF

	// VMALF  X0, YDIG, ADD1H, ADD3
	// VMALF  X1, YDIG, ADD2H, ADD4
	// VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
	// VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
	VMULT_ADD(X0, YDIG, ADD1H, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ADD4, ADD4H)

	VSPLTISB $0, ZER               // VZERO ZER
	LXVD2X   (R17)(CPOOL), SEL1_
	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free->T0 // VSLDB
	VSLDOI $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free // VSLDB

	VADDCUQ  T0, RED1, CAR1     // VACCQ
	VADDUQM  T0, RED1, T0       // VAQ
	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1 // VACQ

	VADDCUQ  T0, ADD3, CAR1       // VACCQ
	VADDUQM  T0, ADD3, T0         // VAQ
	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1   // VACQ
	VADDUQM  T2, CAR2, T2         // VAQ

	LXVD2X  (R18)(CPOOL), SEL2_
	LXVD2X  (R19)(CPOOL), SEL3_
	LXVD2X  (R20)(CPOOL), SEL4_
	VPERM   RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
	VPERM   RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM   RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
	VSUBUQM RED2, RED3, RED2     // Guaranteed not to underflow // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB
1032 1033 VADDCUQ T0, ADD3H, CAR1 // VACCQ 1034 VADDUQM T0, ADD3H, T0 // VAQ 1035 VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ 1036 VADDEUQM T1, ADD4H, CAR1, T1 // VACQ 1037 1038 // --------------------------------------------------- 1039 1040 VSPLTW $3, Y1, YDIG // VREPF 1041 LXVD2X (R0)(EVENODD), EXTRACT_LO_ 1042 LXVD2X (R16)(EVENODD), EXTRACT_HI_ 1043 1044 // VMALHF X0, YDIG, T0, ADD1H 1045 // VMALHF X1, YDIG, T1, ADD2H 1046 // VMALF X0, YDIG, T0, ADD1 1047 // VMALF X1, YDIG, T1, ADD2 1048 VMULT_ADD(X0, YDIG, T0, ADD1, ADD1H) 1049 VMULT_ADD(X1, YDIG, T1, ADD2, ADD2H) 1050 1051 VSPLTW $2, Y1, YDIG // VREPF 1052 1053 // VMALF X0, YDIG, ADD1H, ADD3 1054 // VMALF X1, YDIG, ADD2H, ADD4 1055 // VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free 1056 // VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free 1057 VMULT_ADD(X0, YDIG, ADD1H, ADD3, ADD3H) 1058 VMULT_ADD(X1, YDIG, ADD2H, ADD4, ADD4H) 1059 1060 LXVD2X (R17)(CPOOL), SEL1_ 1061 VSPLTISB $0, ZER // VZERO ZER 1062 LXVD2X (R17)(CPOOL), SEL1_ 1063 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] 1064 1065 VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free // VSLDB 1066 VSLDOI $12, T2, ADD2, T1 // ADD2 Free // VSLDB 1067 1068 VADDCUQ T0, RED1, CAR1 // VACCQ 1069 VADDUQM T0, RED1, T0 // VAQ 1070 VADDECUQ T1, RED2, CAR1, T2 // VACCCQ 1071 VADDEUQM T1, RED2, CAR1, T1 // VACQ 1072 1073 VADDCUQ T0, ADD3, CAR1 // VACCQ 1074 VADDUQM T0, ADD3, T0 // VAQ 1075 VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ 1076 VADDEUQM T1, ADD4, CAR1, T1 // VACQ 1077 VADDUQM T2, CAR2, T2 // VAQ 1078 1079 LXVD2X (R18)(CPOOL), SEL2_ 1080 LXVD2X (R19)(CPOOL), SEL3_ 1081 LXVD2X (R20)(CPOOL), SEL4_ 1082 VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0] 1083 VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1] 1084 VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0] 1085 VSUBUQM RED2, RED3, RED2 // Guaranteed not to underflow // VSQ 1086 1087 VSLDOI $12, T1, T0, T0 // VSLDB 1088 VSLDOI $12, T2, T1, T1 // VSLDB 1089 1090 VADDCUQ T0, ADD3H, CAR1 // VACCQ 1091 VADDUQM T0, ADD3H, T0 // VAQ 1092 VADDECUQ T1, ADD4H, CAR1, 
T2 // VACCCQ 1093 VADDEUQM T1, ADD4H, CAR1, T1 // VACQ 1094 1095 // --------------------------------------------------- 1096 1097 VSPLTW $1, Y1, YDIG // VREPF 1098 LXVD2X (R0)(EVENODD), EXTRACT_LO_ 1099 LXVD2X (R16)(EVENODD), EXTRACT_HI_ 1100 1101 // VMALHF X0, YDIG, T0, ADD1H 1102 // VMALHF X1, YDIG, T1, ADD2H 1103 // VMALF X0, YDIG, T0, ADD1 1104 // VMALF X1, YDIG, T1, ADD2 1105 VMULT_ADD(X0, YDIG, T0, ADD1, ADD1H) 1106 VMULT_ADD(X1, YDIG, T1, ADD2, ADD2H) 1107 1108 VSPLTW $0, Y1, YDIG // VREPF 1109 1110 // VMALF X0, YDIG, ADD1H, ADD3 1111 // VMALF X1, YDIG, ADD2H, ADD4 1112 // VMALHF X0, YDIG, ADD1H, ADD3H 1113 // VMALHF X1, YDIG, ADD2H, ADD4H 1114 VMULT_ADD(X0, YDIG, ADD1H, ADD3, ADD3H) 1115 VMULT_ADD(X1, YDIG, ADD2H, ADD4, ADD4H) 1116 1117 VSPLTISB $0, ZER // VZERO ZER 1118 LXVD2X (R17)(CPOOL), SEL1_ 1119 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] 1120 1121 VSLDOI $12, ADD2, ADD1, T0 // VSLDB 1122 VSLDOI $12, T2, ADD2, T1 // VSLDB 1123 1124 VADDCUQ T0, RED1, CAR1 // VACCQ 1125 VADDUQM T0, RED1, T0 // VAQ 1126 VADDECUQ T1, RED2, CAR1, T2 // VACCCQ 1127 VADDEUQM T1, RED2, CAR1, T1 // VACQ 1128 1129 VADDCUQ T0, ADD3, CAR1 // VACCQ 1130 VADDUQM T0, ADD3, T0 // VAQ 1131 VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ 1132 VADDEUQM T1, ADD4, CAR1, T1 // VACQ 1133 VADDUQM T2, CAR2, T2 // VAQ 1134 1135 LXVD2X (R21)(CPOOL), SEL5_ 1136 LXVD2X (R22)(CPOOL), SEL6_ 1137 VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0] 1138 VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0 0] 1139 VSUBUQM RED2, RED1, RED2 // Guaranteed not to underflow // VSQ 1140 1141 VSLDOI $12, T1, T0, T0 // VSLDB 1142 VSLDOI $12, T2, T1, T1 // VSLDB 1143 1144 VADDCUQ T0, ADD3H, CAR1 // VACCQ 1145 VADDUQM T0, ADD3H, T0 // VAQ 1146 VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ 1147 VADDEUQM T1, ADD4H, CAR1, T1 // VACQ 1148 1149 VADDCUQ T0, RED1, CAR1 // VACCQ 1150 VADDUQM T0, RED1, T0 // VAQ 1151 VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ 1152 VADDEUQM T1, RED2, CAR1, T1 // VACQ 1153 VADDUQM T2, CAR2, T2 // VAQ 1154 1155 // 
--------------------------------------------------- 1156 1157 VSPLTISB $0, RED3 // VZERO RED3 1158 VSUBCUQ T0, P0, CAR1 // VSCBIQ 1159 VSUBUQM T0, P0, ADD1H // VSQ 1160 VSUBECUQ T1, P1, CAR1, CAR2 // VSBCBIQ 1161 VSUBEUQM T1, P1, CAR1, ADD2H // VSBIQ 1162 VSUBEUQM T2, RED3, CAR2, T2 // VSBIQ 1163 1164 // what output to use, ADD2H||ADD1H or T1||T0? 1165 VSEL ADD1H, T0, T2, T0 1166 VSEL ADD2H, T1, T2, T1 1167 RET 1168 1169 #undef CPOOL 1170 1171 #undef X0 1172 #undef X1 1173 #undef Y0 1174 #undef Y1 1175 #undef T0 1176 #undef T1 1177 #undef P0 1178 #undef P1 1179 1180 #undef SEL1 1181 #undef SEL2 1182 #undef SEL3 1183 #undef SEL4 1184 #undef SEL5 1185 #undef SEL6 1186 #undef SEL1_ 1187 #undef SEL2_ 1188 #undef SEL3_ 1189 #undef SEL4_ 1190 #undef SEL5_ 1191 #undef SEL6_ 1192 1193 #undef YDIG 1194 #undef ADD1H 1195 #undef ADD2H 1196 #undef ADD3 1197 #undef ADD4 1198 #undef RED1 1199 #undef RED2 1200 #undef RED3 1201 #undef T2 1202 #undef ADD1 1203 #undef ADD2 1204 #undef ADD3H 1205 #undef ADD4H 1206 #undef ZER 1207 #undef CAR1 1208 #undef CAR2 1209 1210 #undef TMP1 1211 #undef TMP2 1212 #undef EVENODD 1213 #undef EXTRACT_HI 1214 #undef EXTRACT_HI_ 1215 #undef EXTRACT_LO 1216 #undef EXTRACT_LO_ 1217 1218 #define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \ 1219 VSPLTISB $0, ZER \ // VZERO 1220 VSUBCUQ X0, Y0, CAR1 \ 1221 VSUBUQM X0, Y0, T0 \ 1222 VSUBECUQ X1, Y1, CAR1, SEL1 \ 1223 VSUBEUQM X1, Y1, CAR1, T1 \ 1224 VSUBUQM ZER, SEL1, SEL1 \ // VSQ 1225 \ 1226 VADDCUQ T0, PL, CAR1 \ // VACCQ 1227 VADDUQM T0, PL, TT0 \ // VAQ 1228 VADDEUQM T1, PH, CAR1, TT1 \ // VACQ 1229 \ 1230 VSEL TT0, T0, SEL1, T0 \ 1231 VSEL TT1, T1, SEL1, T1 \ 1232 1233 #define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \ 1234 VADDCUQ X0, Y0, CAR1 \ 1235 VADDUQM X0, Y0, T0 \ 1236 VADDECUQ X1, Y1, CAR1, T2 \ // VACCCQ 1237 VADDEUQM X1, Y1, CAR1, T1 \ 1238 \ 1239 VSPLTISB $0, ZER \ 1240 VSUBCUQ T0, PL, CAR1 \ // VSCBIQ 1241 VSUBUQM T0, PL, TT0 \ 1242 VSUBECUQ T1, PH, CAR1, CAR2 \ // VSBCBIQ 1243 VSUBEUQM T1, 
PH, CAR1, TT1 \ // VSBIQ 1244 VSUBEUQM T2, ZER, CAR2, SEL1 \ 1245 \ 1246 VSEL TT0, T0, SEL1, T0 \ 1247 VSEL TT1, T1, SEL1, T1 1248 1249 #define p256HalfInternal(T1, T0, X1, X0) \ 1250 VSPLTISB $0, ZER \ 1251 VSUBEUQM ZER, ZER, X0, SEL1 \ 1252 \ 1253 VADDCUQ X0, PL, CAR1 \ 1254 VADDUQM X0, PL, T0 \ 1255 VADDECUQ X1, PH, CAR1, T2 \ 1256 VADDEUQM X1, PH, CAR1, T1 \ 1257 \ 1258 VSEL T0, X0, SEL1, T0 \ 1259 VSEL T1, X1, SEL1, T1 \ 1260 VSEL T2, ZER, SEL1, T2 \ 1261 \ 1262 VSLDOI $15, T2, ZER, TT1 \ 1263 VSLDOI $15, T1, ZER, TT0 \ 1264 VSPLTISB $1, SEL1 \ 1265 VSR T0, SEL1, T0 \ // VSRL 1266 VSR T1, SEL1, T1 \ 1267 VSPLTISB $7, SEL1 \ // VREPIB 1268 VSL TT0, SEL1, TT0 \ 1269 VSL TT1, SEL1, TT1 \ 1270 VOR T0, TT0, T0 \ 1271 VOR T1, TT1, T1 1272 1273 // --------------------------------------- 1274 // func p256MulAsm(res, in1, in2 []byte) 1275 #define res_ptr R3 1276 #define x_ptr R4 1277 #define y_ptr R5 1278 #define CPOOL R7 1279 #define TEMP R8 1280 1281 // Parameters 1282 #define X0 V0 1283 #define X1 V1 1284 #define Y0 V2 1285 #define Y1 V3 1286 #define T0 V4 1287 #define T1 V5 1288 #define X0_ VS32 1289 #define X1_ VS33 1290 #define Y0_ VS34 1291 #define Y1_ VS35 1292 #define T0_ VS36 1293 #define T1_ VS37 1294 #define SWAP V28 1295 #define SWAP_ VS60 1296 1297 // Constants 1298 #define P0 V30 1299 #define P1 V31 1300 #define P0_ VS62 1301 #define P1_ VS63 1302 // 1303 // Montgomery multiplication modulo P256 1304 // 1305 TEXT ·p256MulAsm(SB), NOSPLIT, $0-72 1306 MOVD res+0(FP), res_ptr 1307 MOVD in1+24(FP), x_ptr 1308 MOVD in2+48(FP), y_ptr 1309 MOVD $16, R16 1310 MOVD $32, R17 1311 1312 MOVD $p256mul<>+0x00(SB), CPOOL 1313 MOVD $byteswap<>+0x00(SB), R8 1314 1315 LXVD2X (R8)(R0), SWAP_ 1316 1317 LXVD2X (R0)(x_ptr), X0_ 1318 LXVD2X (R16)(x_ptr), X1_ 1319 1320 VPERM X0, X0, SWAP, X0 1321 VPERM X1, X1, SWAP, X1 1322 1323 LXVD2X (R0)(y_ptr), Y0_ 1324 LXVD2X (R16)(y_ptr), Y1_ 1325 1326 VPERM Y0, Y0, SWAP, Y0 1327 VPERM Y1, Y1, SWAP, Y1 1328 1329 LXVD2X (R16)(CPOOL), P1_ 
1330 LXVD2X (R0)(CPOOL), P0_ 1331 1332 CALL p256MulInternal<>(SB) 1333 1334 MOVD $p256mul<>+0x00(SB), CPOOL 1335 MOVD $byteswap<>+0x00(SB), R8 1336 1337 LXVD2X (R8)(R0), SWAP_ 1338 1339 VPERM T0, T0, SWAP, T0 1340 VPERM T1, T1, SWAP, T1 1341 STXVD2X T0_, (R0)(res_ptr) 1342 STXVD2X T1_, (R16)(res_ptr) 1343 RET 1344 1345 #undef res_ptr 1346 #undef x_ptr 1347 #undef y_ptr 1348 #undef CPOOL 1349 1350 #undef X0 1351 #undef X1 1352 #undef Y0 1353 #undef Y1 1354 #undef T0 1355 #undef T1 1356 #undef P0 1357 #undef P1 1358 #undef X0_ 1359 #undef X1_ 1360 #undef Y0_ 1361 #undef Y1_ 1362 #undef T0_ 1363 #undef T1_ 1364 #undef P0_ 1365 #undef P1_ 1366 1367 // Point add with P2 being affine point 1368 // If sign == 1 -> P2 = -P2 1369 // If sel == 0 -> P3 = P1 1370 // if zero == 0 -> P3 = P2 1371 // p256PointAddAffineAsm(P3, P1, P2 *p256Point, sign, sel, zero int) 1372 #define P3ptr R3 1373 #define P1ptr R4 1374 #define P2ptr R5 1375 #define CPOOL R7 1376 1377 // Temporaries in REGs 1378 #define Y2L V15 1379 #define Y2H V16 1380 #define Y2L_ VS47 1381 #define Y2H_ VS48 1382 #define T1L V17 1383 #define T1H V18 1384 #define T2L V19 1385 #define T2H V20 1386 #define T3L V21 1387 #define T3H V22 1388 #define T4L V23 1389 #define T4H V24 1390 1391 // Temps for Sub and Add 1392 #define TT0 V11 1393 #define TT1 V12 1394 #define T2 V13 1395 1396 // p256MulAsm Parameters 1397 #define X0 V0 1398 #define X1 V1 1399 #define X0_ VS32 1400 #define X1_ VS33 1401 #define Y0 V2 1402 #define Y1 V3 1403 #define Y0_ VS34 1404 #define Y1_ VS35 1405 #define T0 V4 1406 #define T1 V5 1407 1408 #define PL V30 1409 #define PH V31 1410 #define PL_ VS62 1411 #define PH_ VS63 1412 1413 // Names for zero/sel selects 1414 #define X1L V0 1415 #define X1H V1 1416 #define X1L_ VS32 1417 #define X1H_ VS33 1418 #define Y1L V2 // p256MulAsmParmY 1419 #define Y1H V3 // p256MulAsmParmY 1420 #define Y1L_ VS34 1421 #define Y1H_ VS35 1422 #define Z1L V4 1423 #define Z1H V5 1424 #define Z1L_ VS36 1425 #define Z1H_ VS37 
1426 #define X2L V0 1427 #define X2H V1 1428 #define X2L_ VS32 1429 #define X2H_ VS33 1430 #define Z2L V4 1431 #define Z2H V5 1432 #define Z2L_ VS36 1433 #define Z2H_ VS37 1434 #define X3L V17 // T1L 1435 #define X3H V18 // T1H 1436 #define Y3L V21 // T3L 1437 #define Y3H V22 // T3H 1438 #define Z3L V25 1439 #define Z3H V26 1440 #define X3L_ VS49 1441 #define X3H_ VS50 1442 #define Y3L_ VS53 1443 #define Y3H_ VS54 1444 #define Z3L_ VS57 1445 #define Z3H_ VS58 1446 1447 #define ZER V6 1448 #define SEL1 V7 1449 #define SEL1_ VS39 1450 #define CAR1 V8 1451 #define CAR2 V9 1452 /* * 1453 * Three operand formula: 1454 * Source: 2004 Hankerson–Menezes–Vanstone, page 91. 1455 * T1 = Z1² 1456 * T2 = T1*Z1 1457 * T1 = T1*X2 1458 * T2 = T2*Y2 1459 * T1 = T1-X1 1460 * T2 = T2-Y1 1461 * Z3 = Z1*T1 1462 * T3 = T1² 1463 * T4 = T3*T1 1464 * T3 = T3*X1 1465 * T1 = 2*T3 1466 * X3 = T2² 1467 * X3 = X3-T1 1468 * X3 = X3-T4 1469 * T3 = T3-X3 1470 * T3 = T3*T2 1471 * T4 = T4*Y1 1472 * Y3 = T3-T4 1473 1474 * Three operand formulas, but with MulInternal X,Y used to store temps 1475 X=Z1; Y=Z1; MUL;T- // T1 = Z1² T1 1476 X=T ; Y- ; MUL;T2=T // T2 = T1*Z1 T1 T2 1477 X- ; Y=X2; MUL;T1=T // T1 = T1*X2 T1 T2 1478 X=T2; Y=Y2; MUL;T- // T2 = T2*Y2 T1 T2 1479 SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2 1480 SUB(Y<T1-X1) // T1 = T1-X1 T1 T2 1481 X=Z1; Y- ; MUL;Z3:=T// Z3 = Z1*T1 T2 1482 X=Y; Y- ; MUL;X=T // T3 = T1*T1 T2 1483 X- ; Y- ; MUL;T4=T // T4 = T3*T1 T2 T4 1484 X- ; Y=X1; MUL;T3=T // T3 = T3*X1 T2 T3 T4 1485 ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4 1486 X=T2; Y=T2; MUL;T- // X3 = T2*T2 T1 T2 T3 T4 1487 SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4 1488 SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4 1489 SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4 1490 X- ; Y- ; MUL;T3=T // T3 = T3*T2 T2 T3 T4 1491 X=T4; Y=Y1; MUL;T- // T4 = T4*Y1 T3 T4 1492 SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4 1493 1494 */ 1495 // 1496 // V27 is clobbered by p256MulInternal so must be 1497 // saved in a temp. 
1498 // 1499 TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $16-48 1500 MOVD res+0(FP), P3ptr 1501 MOVD in1+8(FP), P1ptr 1502 MOVD in2+16(FP), P2ptr 1503 1504 MOVD $p256mul<>+0x00(SB), CPOOL 1505 1506 MOVD $16, R16 1507 MOVD $32, R17 1508 MOVD $48, R18 1509 MOVD $64, R19 1510 MOVD $80, R20 1511 MOVD $96, R21 1512 MOVD $112, R22 1513 MOVD $128, R23 1514 MOVD $144, R24 1515 MOVD $160, R25 1516 MOVD $104, R26 // offset of sign+24(FP) 1517 1518 MOVD $byteswap<>+0+00(SB), R8 1519 LXVD2X (R16)(CPOOL), PH_ 1520 LXVD2X (R0)(CPOOL), PL_ 1521 1522 // if (sign == 1) { 1523 // Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2 = P-Y2 1524 // } 1525 1526 LXVD2X (R8)(R0), SWAP_ 1527 LXVD2X (R17)(P2ptr), Y2L_ 1528 LXVD2X (R18)(P2ptr), Y2H_ 1529 VPERM Y2H, Y2H, SWAP, Y2H 1530 VPERM Y2L, Y2L, SWAP, Y2L 1531 1532 // Equivalent of VLREPG sign+24(FP), SEL1 1533 LXVDSX (R1)(R26), SEL1_ 1534 VSPLTISB $0, ZER 1535 VCMPEQUD SEL1, ZER, SEL1 1536 1537 VSUBCUQ PL, Y2L, CAR1 1538 VSUBUQM PL, Y2L, T1L 1539 VSUBEUQM PH, Y2H, CAR1, T1H 1540 1541 VSEL T1L, Y2L, SEL1, Y2L 1542 VSEL T1H, Y2H, SEL1, Y2H 1543 1544 /* * 1545 * Three operand formula: 1546 * Source: 2004 Hankerson–Menezes–Vanstone, page 91. 
1547 */ 1548 // X=Z1; Y=Z1; MUL; T- // T1 = Z1² T1 1549 LXVD2X (R8)(R0), SWAP_ 1550 LXVD2X (R19)(P1ptr), X0_ // Z1H 1551 LXVD2X (R20)(P1ptr), X1_ // Z1L 1552 VPERM X0, X0, SWAP, X0 1553 VPERM X1, X1, SWAP, X1 1554 VOR X0, X0, Y0 1555 VOR X1, X1, Y1 1556 CALL p256MulInternal<>(SB) 1557 1558 // X=T ; Y- ; MUL; T2=T // T2 = T1*Z1 T1 T2 1559 VOR T0, T0, X0 1560 VOR T1, T1, X1 1561 CALL p256MulInternal<>(SB) 1562 VOR T0, T0, T2L 1563 VOR T1, T1, T2H 1564 1565 // X- ; Y=X2; MUL; T1=T // T1 = T1*X2 T1 T2 1566 MOVD in2+16(FP), P2ptr 1567 LXVD2X (R8)(R0), SWAP_ 1568 LXVD2X (R0)(P2ptr), Y0_ // X2H 1569 LXVD2X (R16)(P2ptr), Y1_ // X2L 1570 VPERM Y0, Y0, SWAP, Y0 1571 VPERM Y1, Y1, SWAP, Y1 1572 CALL p256MulInternal<>(SB) 1573 VOR T0, T0, T1L 1574 VOR T1, T1, T1H 1575 1576 // X=T2; Y=Y2; MUL; T- // T2 = T2*Y2 T1 T2 1577 VOR T2L, T2L, X0 1578 VOR T2H, T2H, X1 1579 VOR Y2L, Y2L, Y0 1580 VOR Y2H, Y2H, Y1 1581 CALL p256MulInternal<>(SB) 1582 1583 // SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2 1584 MOVD in1+8(FP), P1ptr 1585 LXVD2X (R8)(R0), SWAP_ 1586 LXVD2X (R17)(P1ptr), Y1L_ 1587 LXVD2X (R18)(P1ptr), Y1H_ 1588 VPERM Y1H, Y1H, SWAP, Y1H 1589 VPERM Y1L, Y1L, SWAP, Y1L 1590 p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L) 1591 1592 // SUB(Y<T1-X1) // T1 = T1-X1 T1 T2 1593 LXVD2X (R0)(P1ptr), X1L_ 1594 LXVD2X (R16)(P1ptr), X1H_ 1595 VPERM X1H, X1H, SWAP, X1H 1596 VPERM X1L, X1L, SWAP, X1L 1597 p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L) 1598 1599 // X=Z1; Y- ; MUL; Z3:=T// Z3 = Z1*T1 T2 1600 LXVD2X (R19)(P1ptr), X0_ // Z1H 1601 LXVD2X (R20)(P1ptr), X1_ // Z1L 1602 VPERM X0, X0, SWAP, X0 1603 VPERM X1, X1, SWAP, X1 1604 CALL p256MulInternal<>(SB) 1605 1606 VOR T0, T0, Z3L 1607 VOR T1, T1, Z3H 1608 1609 // X=Y; Y- ; MUL; X=T // T3 = T1*T1 T2 1610 VOR Y0, Y0, X0 1611 VOR Y1, Y1, X1 1612 CALL p256MulInternal<>(SB) 1613 VOR T0, T0, X0 1614 VOR T1, T1, X1 1615 1616 // X- ; Y- ; MUL; T4=T // T4 = T3*T1 T2 T4 1617 CALL p256MulInternal<>(SB) 1618 VOR T0, T0, T4L 1619 VOR T1, T1, T4H 1620 1621 // X- ; Y=X1; MUL; T3=T 
// T3 = T3*X1 T2 T3 T4 1622 MOVD in1+8(FP), P1ptr 1623 LXVD2X (R8)(R0), SWAP_ 1624 LXVD2X (R0)(P1ptr), Y0_ // X1H 1625 LXVD2X (R16)(P1ptr), Y1_ // X1L 1626 VPERM Y1, Y1, SWAP, Y1 1627 VPERM Y0, Y0, SWAP, Y0 1628 CALL p256MulInternal<>(SB) 1629 VOR T0, T0, T3L 1630 VOR T1, T1, T3H 1631 1632 // ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4 1633 p256AddInternal(T1H,T1L, T1,T0,T1,T0) 1634 1635 // X=T2; Y=T2; MUL; T- // X3 = T2*T2 T1 T2 T3 T4 1636 VOR T2L, T2L, X0 1637 VOR T2H, T2H, X1 1638 VOR T2L, T2L, Y0 1639 VOR T2H, T2H, Y1 1640 CALL p256MulInternal<>(SB) 1641 1642 // SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4 (T1 = X3) 1643 p256SubInternal(T1,T0,T1,T0,T1H,T1L) 1644 1645 // SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4 1646 p256SubInternal(T1,T0,T1,T0,T4H,T4L) 1647 VOR T0, T0, X3L 1648 VOR T1, T1, X3H 1649 1650 // SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4 1651 p256SubInternal(X1,X0,T3H,T3L,T1,T0) 1652 1653 // X- ; Y- ; MUL; T3=T // T3 = T3*T2 T2 T3 T4 1654 CALL p256MulInternal<>(SB) 1655 VOR T0, T0, T3L 1656 VOR T1, T1, T3H 1657 1658 // X=T4; Y=Y1; MUL; T- // T4 = T4*Y1 T3 T4 1659 VOR T4L, T4L, X0 1660 VOR T4H, T4H, X1 1661 MOVD in1+8(FP), P1ptr 1662 LXVD2X (R8)(R0), SWAP_ 1663 LXVD2X (R17)(P1ptr), Y0_ // Y1H 1664 LXVD2X (R18)(P1ptr), Y1_ // Y1L 1665 VPERM Y0, Y0, SWAP, Y0 1666 VPERM Y1, Y1, SWAP, Y1 1667 CALL p256MulInternal<>(SB) 1668 1669 // SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4 (T3 = Y3) 1670 p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0) 1671 1672 // if (sel == 0) { 1673 // copy(P3.x[:], X1) 1674 // copy(P3.y[:], Y1) 1675 // copy(P3.z[:], Z1) 1676 // } 1677 1678 LXVD2X (R8)(R0), SWAP_ 1679 LXVD2X (R0)(P1ptr), X1L_ 1680 LXVD2X (R16)(P1ptr), X1H_ 1681 VPERM X1H, X1H, SWAP, X1H 1682 VPERM X1L, X1L, SWAP, X1L 1683 1684 // Y1 already loaded, left over from addition 1685 LXVD2X (R19)(P1ptr), Z1L_ 1686 LXVD2X (R20)(P1ptr), Z1H_ 1687 VPERM Z1H, Z1H, SWAP, Z1H 1688 VPERM Z1L, Z1L, SWAP, Z1L 1689 1690 MOVD $112, R26 // Get offset to sel+32 1691 LXVDSX (R1)(R26), SEL1_ 1692 VSPLTISB $0, ZER 1693 
VCMPEQUD SEL1, ZER, SEL1 1694 1695 VSEL X3L, X1L, SEL1, X3L 1696 VSEL X3H, X1H, SEL1, X3H 1697 VSEL Y3L, Y1L, SEL1, Y3L 1698 VSEL Y3H, Y1H, SEL1, Y3H 1699 VSEL Z3L, Z1L, SEL1, Z3L 1700 VSEL Z3H, Z1H, SEL1, Z3H 1701 1702 // if (zero == 0) { 1703 // copy(P3.x[:], X2) 1704 // copy(P3.y[:], Y2) 1705 // copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 1706 // 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}) //(p256.z*2^256)%p 1707 // } 1708 MOVD in2+16(FP), P2ptr 1709 LXVD2X (R0)(P2ptr), X2L_ 1710 LXVD2X (R16)(P2ptr), X2H_ 1711 VPERM X2H, X2H, SWAP, X2H 1712 VPERM X2L, X2L, SWAP, X2L 1713 1714 // Y2 already loaded 1715 LXVD2X (R23)(CPOOL), Z2L_ 1716 LXVD2X (R24)(CPOOL), Z2H_ 1717 1718 MOVD $120, R26 // Get the value from zero+40(FP) 1719 LXVDSX (R1)(R26), SEL1_ 1720 VSPLTISB $0, ZER 1721 VCMPEQUD SEL1, ZER, SEL1 1722 1723 VSEL X3L, X2L, SEL1, X3L 1724 VSEL X3H, X2H, SEL1, X3H 1725 VSEL Y3L, Y2L, SEL1, Y3L 1726 VSEL Y3H, Y2H, SEL1, Y3H 1727 VSEL Z3L, Z2L, SEL1, Z3L 1728 VSEL Z3H, Z2H, SEL1, Z3H 1729 1730 // Reorder the bytes so they can be stored using STXVD2X. 
1731 MOVD res+0(FP), P3ptr 1732 VPERM X3H, X3H, SWAP, X3H 1733 VPERM X3L, X3L, SWAP, X3L 1734 VPERM Y3H, Y3H, SWAP, Y3H 1735 VPERM Y3L, Y3L, SWAP, Y3L 1736 VPERM Z3H, Z3H, SWAP, Z3H 1737 VPERM Z3L, Z3L, SWAP, Z3L 1738 STXVD2X X3L_, (R0)(P3ptr) 1739 STXVD2X X3H_, (R16)(P3ptr) 1740 STXVD2X Y3L_, (R17)(P3ptr) 1741 STXVD2X Y3H_, (R18)(P3ptr) 1742 STXVD2X Z3L_, (R19)(P3ptr) 1743 STXVD2X Z3H_, (R20)(P3ptr) 1744 1745 RET 1746 1747 #undef P3ptr 1748 #undef P1ptr 1749 #undef P2ptr 1750 #undef CPOOL 1751 #undef SWAP 1752 #undef SWAP_ 1753 1754 #undef Y2L 1755 #undef Y2H 1756 #undef Y2L_ 1757 #undef Y2H_ 1758 #undef T1L 1759 #undef T1H 1760 #undef T2L 1761 #undef T2H 1762 #undef T3L 1763 #undef T3H 1764 #undef T4L 1765 #undef T4H 1766 1767 #undef TT0 1768 #undef TT1 1769 #undef TT0_ 1770 #undef TT1_ 1771 #undef T2 1772 1773 #undef X0 1774 #undef X1 1775 #undef X0_ 1776 #undef X1_ 1777 #undef Y0 1778 #undef Y1 1779 #undef Y0_ 1780 #undef Y1_ 1781 #undef T0 1782 #undef T1 1783 1784 #undef PL 1785 #undef PH 1786 #undef PL_ 1787 #undef PH_ 1788 1789 #undef X1L 1790 #undef X1H 1791 #undef X1L_ 1792 #undef X1H_ 1793 #undef Y1L 1794 #undef Y1H 1795 #undef Y1L_ 1796 #undef Y1H_ 1797 #undef Z1L 1798 #undef Z1H 1799 #undef Z1L_ 1800 #undef Z1H_ 1801 #undef X2L 1802 #undef X2H 1803 #undef X2L_ 1804 #undef X2H_ 1805 #undef Z2L 1806 #undef Z2H 1807 #undef Z2L_ 1808 #undef Z2H_ 1809 #undef X3L 1810 #undef X3H 1811 #undef X3L_ 1812 #undef X3H_ 1813 #undef Y3L 1814 #undef Y3H 1815 #undef Y3L_ 1816 #undef Y3H_ 1817 #undef Z3L 1818 #undef Z3H 1819 #undef Z3L_ 1820 #undef Z3H_ 1821 1822 #undef ZER 1823 #undef SEL1 1824 #undef SEL1_ 1825 #undef CAR1 1826 #undef CAR2 1827 1828 // p256PointDoubleAsm(P3, P1 *p256Point) 1829 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl 1830 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw.html 1831 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html 1832 #define P3ptr R3 1833 #define P1ptr R4 1834 
#define CPOOL R7 1835 1836 // Temporaries in REGs 1837 #define X3L V15 1838 #define X3H V16 1839 #define X3L_ VS47 1840 #define X3H_ VS48 1841 #define Y3L V17 1842 #define Y3H V18 1843 #define Y3L_ VS49 1844 #define Y3H_ VS50 1845 #define T1L V19 1846 #define T1H V20 1847 #define T2L V21 1848 #define T2H V22 1849 #define T3L V23 1850 #define T3H V24 1851 1852 #define X1L V6 1853 #define X1H V7 1854 #define X1L_ VS38 1855 #define X1H_ VS39 1856 #define Y1L V8 1857 #define Y1H V9 1858 #define Y1L_ VS40 1859 #define Y1H_ VS41 1860 #define Z1L V10 1861 #define Z1H V11 1862 1863 // Temps for Sub and Add 1864 #define TT0 V11 1865 #define TT1 V12 1866 #define TT0_ VS43 1867 #define TT1_ VS44 1868 #define T2 V13 1869 1870 // p256MulAsm Parameters 1871 #define X0 V0 1872 #define X1 V1 1873 #define X0_ VS32 1874 #define X1_ VS33 1875 #define Y0 V2 1876 #define Y1 V3 1877 #define Y0_ VS34 1878 #define Y1_ VS35 1879 #define T0 V4 1880 #define T1 V5 1881 #define T0_ VS36 1882 #define T1_ VS37 1883 1884 #define PL V30 1885 #define PH V31 1886 #define PL_ VS62 1887 #define PH_ VS63 1888 1889 #define Z3L V23 1890 #define Z3H V24 1891 1892 #define SWAP V25 1893 #define SWAP_ VS57 1894 #define ZER V26 1895 #define SEL1 V27 1896 #define CAR1 V28 1897 #define CAR2 V29 1898 /* 1899 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv 1900 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3. 1901 * Source: 2004 Hankerson–Menezes–Vanstone, page 91. 
 * A  = 3(X₁-Z₁²)×(X₁+Z₁²)
 * B  = 2Y₁
 * Z₃ = B×Z₁
 * C  = B²
 * D  = C×X₁
 * X₃ = A²-2D
 * Y₃ = (D-X₃)×A-C²/2
 *
 * Three-operand formula:
 * T1 = Z1²
 * T2 = X1-T1
 * T1 = X1+T1
 * T2 = T2*T1
 * T2 = 3*T2
 * Y3 = 2*Y1
 * Z3 = Y3*Z1
 * Y3 = Y3²
 * T3 = Y3*X1
 * Y3 = Y3²
 * Y3 = half*Y3
 * X3 = T2²
 * T1 = 2*T3
 * X3 = X3-T1
 * T1 = T3-X3
 * T1 = T1*T2
 * Y3 = T1-Y3
 */

TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0-16
	MOVD res+0(FP), P3ptr
	MOVD in+8(FP), P1ptr

	MOVD $p256mul<>+0x00(SB), CPOOL
	MOVD $byteswap<>+0x00(SB), R15

	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	LXVD2X (R16)(CPOOL), PH_
	LXVD2X (R0)(CPOOL), PL_

	// Byte-swap permute mask; reloaded after each CALL because
	// p256MulInternal may clobber the vector registers holding it.
	LXVD2X (R15)(R0), SWAP_

	// X=Z1; Y=Z1; MUL; T- // T1 = Z1²
	LXVD2X (R19)(P1ptr), X0_ // Z1H
	LXVD2X (R20)(P1ptr), X1_ // Z1L

	VPERM X0, X0, SWAP, X0
	VPERM X1, X1, SWAP, X1

	VOR  X0, X0, Y0
	VOR  X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// SUB(X<X1-T) // T2 = X1-T1
	LXVD2X (R0)(P1ptr), X1L_
	LXVD2X (R16)(P1ptr), X1H_
	VPERM  X1L, X1L, SWAP, X1L
	VPERM  X1H, X1H, SWAP, X1H

	p256SubInternal(X1,X0,X1H,X1L,T1,T0)

	// ADD(Y<X1+T) // T1 = X1+T1
	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)

	// X- ; Y- ; MUL; T- // T2 = T2*T1
	CALL p256MulInternal<>(SB)

	// ADD(T2<T+T); ADD(T2<T2+T) // T2 = 3*T2
	p256AddInternal(T2H,T2L,T1,T0,T1,T0)
	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)

	// ADD(X<Y1+Y1) // Y3 = 2*Y1
	LXVD2X (R15)(R0), SWAP_
	LXVD2X (R17)(P1ptr), Y1L_
	LXVD2X (R18)(P1ptr), Y1H_
	VPERM  Y1L, Y1L, SWAP, Y1L
	VPERM  Y1H, Y1H, SWAP, Y1H

	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)

	// X- ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
	LXVD2X (R15)(R0), SWAP_
	LXVD2X (R19)(P1ptr), Y0_
	LXVD2X (R20)(P1ptr), Y1_
	VPERM  Y0, Y0, SWAP, Y0
	VPERM  Y1, Y1, SWAP, Y1

	CALL p256MulInternal<>(SB)

	LXVD2X (R15)(R0), SWAP_

	// Leave T0, T1 as is.
	VPERM   T0, T0, SWAP, TT0
	VPERM   T1, T1, SWAP, TT1
	STXVD2X TT0_, (R19)(P3ptr)
	STXVD2X TT1_, (R20)(P3ptr)

	// X- ; Y=X ; MUL; T- // Y3 = Y3²
	VOR  X0, X0, Y0
	VOR  X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1
	VOR    T0, T0, X0
	VOR    T1, T1, X1
	LXVD2X (R15)(R0), SWAP_
	LXVD2X (R0)(P1ptr), Y0_
	LXVD2X (R16)(P1ptr), Y1_
	VPERM  Y0, Y0, SWAP, Y0
	VPERM  Y1, Y1, SWAP, Y1
	CALL   p256MulInternal<>(SB)
	VOR    T0, T0, T3L
	VOR    T1, T1, T3H

	// X- ; Y=X ; MUL; T- // Y3 = Y3²
	VOR  X0, X0, Y0
	VOR  X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// HAL(Y3<T) // Y3 = half*Y3
	p256HalfInternal(Y3H,Y3L, T1,T0)

	// X=T2; Y=T2; MUL; T- // X3 = T2²
	VOR  T2L, T2L, X0
	VOR  T2H, T2H, X1
	VOR  T2L, T2L, Y0
	VOR  T2H, T2H, Y1
	CALL p256MulInternal<>(SB)

	// ADD(T1<T3+T3) // T1 = 2*T3
	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)

	// SUB(X3<T-T1) X3:=X3 // X3 = X3-T1
	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)

	LXVD2X  (R15)(R0), SWAP_
	VPERM   X3L, X3L, SWAP, TT0
	VPERM   X3H, X3H, SWAP, TT1
	STXVD2X TT0_, (R0)(P3ptr)
	STXVD2X TT1_, (R16)(P3ptr)

	// SUB(X<T3-X3) // T1 = T3-X3
	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)

	// X- ; Y- ; MUL; T- // T1 = T1*T2
	CALL p256MulInternal<>(SB)

	// SUB(Y3<T-Y3) // Y3 = T1-Y3
	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)

	LXVD2X  (R15)(R0), SWAP_
	VPERM   Y3L, Y3L, SWAP, Y3L
	VPERM   Y3H, Y3H, SWAP, Y3H
	STXVD2X Y3L_, (R17)(P3ptr)
	STXVD2X Y3H_, (R18)(P3ptr)
	RET

#undef P3ptr
#undef P1ptr
#undef CPOOL
#undef X3L
#undef X3H
#undef X3L_
#undef X3H_
#undef Y3L
#undef Y3H
#undef Y3L_
#undef Y3H_
#undef T1L
#undef T1H
#undef T2L
#undef T2H
#undef T3L
#undef T3H
#undef X1L
#undef X1H
#undef X1L_
#undef X1H_
#undef Y1L
#undef Y1H
#undef Y1L_
#undef Y1H_
#undef Z1L
#undef Z1H
#undef TT0
#undef TT1
#undef TT0_
#undef TT1_
#undef T2
#undef X0
#undef X1
#undef X0_
#undef X1_
#undef Y0
#undef Y1
#undef Y0_
#undef Y1_
#undef T0
#undef T1
#undef T0_
#undef T1_
#undef PL
#undef PH
#undef PL_
#undef PH_
#undef Z3L
#undef Z3H
#undef ZER
#undef SEL1
#undef CAR1
#undef CAR2
#undef SWAP
#undef SWAP_

// p256PointAddAsm(P3, P1, P2 *p256Point)
// Register aliases for the point-addition routine.  Names ending in `_`
// are the VSX (VSnn) views of the corresponding vector (Vnn) registers,
// needed because LXVD2X/STXVD2X address the VSX file.
#define P3ptr R3
#define P1ptr R4
#define P2ptr R5
#define CPOOL R7
#define TRUE  R14
#define RES1  R9
#define RES2  R10

// Temporaries in REGs (values of the three-operand formula below).
#define T1L V16
#define T1H V17
#define T2L V18
#define T2H V19
#define U1L V20
#define U1H V21
#define S1L V22
#define S1H V23
#define HL  V24
#define HH  V25
#define RL  V26
#define RH  V27
#define RH_ VS59

// Temps for Sub and Add
#define ZER   V6
#define SEL1  V7
#define CAR1  V8
#define CAR2  V9
#define TT0  V11
#define TT0_ VS43
#define TT1  V12
#define TT1_ VS44
#define T2  V13

#define SWAP V28
#define SWAP_ VS60

// p256MulAsm Parameters: X and Y are the multiplier inputs, T receives
// the result of p256MulInternal; PL/PH hold the prime P.
#define X0    V0
#define X1    V1
#define X0_   VS32
#define X1_   VS33
#define Y0    V2
#define Y1    V3
#define Y0_   VS34
#define Y1_   VS35
#define T0    V4
#define T1    V5
#define T0_   VS36
#define T1_   VS37

#define PL    V30
#define PH    V31
#define PL_   VS62
#define PH_   VS63
/*
 * https://choucroutage.com/Papers/SideChannelAttacks/ctrsa-2011-brown.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
 *
 * A = X₁×Z₂²
 * B = Y₁×Z₂³
 * C = X₂×Z₁²-A
 * D = Y₂×Z₁³-B
 * X₃ = D² - 2A×C² - C³
 * Y₃ = D×(A×C² - X₃) - B×C³
 * Z₃ = Z₁×Z₂×C
 *
 * Three-operand formula (adopted): http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
 *
 * T1 = Z1*Z1
 * T2 = Z2*Z2
 * U1 = X1*T2
 * H  = X2*T1
 * H  = H-U1
 * Z3 = Z1*Z2
 * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
 *
 * S1 = Z2*T2
 * S1 = Y1*S1
 * R  = Z1*T1
 * R  = Y2*R
 * R  = R-S1
 *
 * T1 = H*H
 * T2 = H*T1
 * U1 = U1*T1
 *
 * X3 = R*R
 * X3 = X3-T2
 * T1 = 2*U1
 * X3 = X3-T1 << store-out X3 result reg
 *
 * T2 = S1*T2
 * Y3 = U1-X3
 * Y3 = R*Y3
 * Y3 = Y3-T2 << store-out Y3 result reg
 *
 * Mapping of the formula onto the X/Y/T registers of p256MulInternal:
 *
 * // X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
 * // X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
 * // X=X2; Y-  ; MUL; H=T  // H  = X2*T1
 * // X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
 * // X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
 * // X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
 * // SUB(H<H-T)            // H  = H-U1
 * // X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
 * // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg..
 * could override Z1, if slices have same backing array
 * // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
 * // X=Y2; Y=R ; MUL; T-   // R  = Y2*R
 * // SUB(R<T-S1)           // R  = R-S1
 * // X=H ; Y=H ; MUL; T-   // T1 = H*H
 * // X-  ; Y=T ; MUL; T2=T // T2 = H*T1
 * // X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
 * // X=R ; Y=R ; MUL; T-   // X3 = R*R
 * // SUB(T<T-T2)           // X3 = X3-T2
 * // ADD(X<U1+U1)          // T1 = 2*U1
 * // SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
 * // SUB(Y<U1-T)           // Y3 = U1-X3
 * // X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
 * // X=S1; Y=T2; MUL; T-   // T2 = S1*T2
 * // SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
 */

// func p256PointAddAsm(res, in1, in2 *p256Point) int
// Jacobian point addition res = in1 + in2, following the schedule in the
// comment above.  The int return (ret+24(FP)) is set to 1 exactly when
// both H and R are 0 mod P, i.e. when in1 and in2 have the same affine
// coordinates and the addition formula is degenerate (the caller must use
// doubling instead); otherwise it is 0.  Vector register V27 (RH) does
// not survive the CALLs here, so it is spilled to the 16-byte stack slot
// at 32(R1).
TEXT ·p256PointAddAsm(SB), NOSPLIT, $16-32
	MOVD res+0(FP), P3ptr
	MOVD in1+8(FP), P1ptr
	MOVD $p256mul<>+0x00(SB), CPOOL

	// Field-element byte offsets (X:+0/+16, Y:+32/+48, Z:+64/+80).
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	// R8 -> byteswap mask; PH:PL <- prime P.
	MOVD   $byteswap<>+0x00(SB), R8
	LXVD2X (R16)(CPOOL), PH_
	LXVD2X (R0)(CPOOL), PL_

	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
	LXVD2X (R8)(R0), SWAP_
	LXVD2X (R19)(P1ptr), X0_ // Z1L
	LXVD2X (R20)(P1ptr), X1_ // Z1H
	VPERM  X0, X0, SWAP, X0
	VPERM  X1, X1, SWAP, X1
	VOR    X0, X0, Y0
	VOR    X1, X1, Y1
	CALL   p256MulInternal<>(SB)

	// X-  ; Y=T ; MUL; R=T  // R = Z1*T1
	VOR  T0, T0, Y0
	VOR  T1, T1, Y1
	CALL p256MulInternal<>(SB)
	VOR  T0, T0, RL // SAVE: RL
	VOR  T1, T1, RH // SAVE: RH

	STXVD2X RH_, (R1)(R17) // V27 has to be saved

	// X=X2; Y-  ; MUL; H=T  // H = X2*T1
	MOVD   in2+16(FP), P2ptr
	LXVD2X (R8)(R0), SWAP_
	LXVD2X (R0)(P2ptr), X0_  // X2L
	LXVD2X (R16)(P2ptr), X1_ // X2H
	VPERM  X0, X0, SWAP, X0
	VPERM  X1, X1, SWAP, X1
	CALL   p256MulInternal<>(SB)
	VOR    T0, T0, HL // SAVE: HL
	VOR    T1, T1, HH // SAVE: HH

	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
	MOVD   in2+16(FP), P2ptr
	LXVD2X (R8)(R0), SWAP_
	LXVD2X (R19)(P2ptr), X0_ // Z2L
	LXVD2X (R20)(P2ptr), X1_ // Z2H
	VPERM  X0, X0, SWAP, X0
	VPERM  X1, X1, SWAP, X1
	VOR    X0, X0, Y0
	VOR    X1, X1, Y1
	CALL   p256MulInternal<>(SB)

	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
	VOR  T0, T0, Y0
	VOR  T1, T1, Y1
	CALL p256MulInternal<>(SB)
	VOR  T0, T0, S1L // SAVE: S1L
	VOR  T1, T1, S1H // SAVE: S1H

	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
	MOVD   in1+8(FP), P1ptr
	LXVD2X (R8)(R0), SWAP_
	LXVD2X (R0)(P1ptr), X0_  // X1L
	LXVD2X (R16)(P1ptr), X1_ // X1H
	VPERM  X0, X0, SWAP, X0
	VPERM  X1, X1, SWAP, X1
	CALL   p256MulInternal<>(SB)
	VOR    T0, T0, U1L // SAVE: U1L
	VOR    T1, T1, U1H // SAVE: U1H

	// SUB(H<H-T)            // H = H-U1
	p256SubInternal(HH,HL,HH,HL,T1,T0)

	// if H == 0 or H^P == 0 then ret=1 else ret=0
	// (H == 0 mod P means X1 == X2; checked against both residues.)
	// clobbers T1H and T1L
	MOVD       $1, TRUE
	VSPLTISB   $0, ZER
	VOR        HL, HH, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL       $26, R0, TRUE, RES1
	VXOR       HL, PL, T1L // SAVE: T1L
	VXOR       HH, PH, T1H // SAVE: T1H
	VOR        T1L, T1H, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES2
	OR   RES2, RES1, RES1
	MOVD RES1, ret+24(FP)

	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
	MOVD   $byteswap<>+0x00(SB), R8
	MOVD   in1+8(FP), P1ptr
	MOVD   in2+16(FP), P2ptr
	LXVD2X (R8)(R0), SWAP_
	LXVD2X (R19)(P1ptr), X0_ // Z1L
	LXVD2X (R20)(P1ptr), X1_ // Z1H
	VPERM  X0, X0, SWAP, X0
	VPERM  X1, X1, SWAP, X1
	LXVD2X (R19)(P2ptr), Y0_ // Z2L
	LXVD2X (R20)(P2ptr), Y1_ // Z2H
	VPERM  Y0, Y0, SWAP, Y0
	VPERM  Y1, Y1, SWAP, Y1
	CALL   p256MulInternal<>(SB)

	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
	VOR     T0, T0, X0
	VOR     T1, T1, X1
	VOR     HL, HL, Y0
	VOR     HH, HH, Y1
	CALL    p256MulInternal<>(SB)
	MOVD    res+0(FP), P3ptr
	LXVD2X  (R8)(R0), SWAP_
	VPERM   T1, T1, SWAP, TT1
	VPERM   T0, T0, SWAP, TT0
	STXVD2X TT0_, (R19)(P3ptr)
	STXVD2X TT1_, (R20)(P3ptr)

	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
	MOVD   in1+8(FP), P1ptr
	LXVD2X (R17)(P1ptr), X0_
	LXVD2X (R18)(P1ptr), X1_
	VPERM  X0, X0, SWAP, X0
	VPERM  X1, X1, SWAP, X1
	VOR    S1L, S1L, Y0
	VOR    S1H, S1H, Y1
	CALL   p256MulInternal<>(SB)
	VOR    T0, T0, S1L
	VOR    T1, T1, S1H

	// X=Y2; Y=R ; MUL; T-   // R = Y2*R
	MOVD   in2+16(FP), P2ptr
	LXVD2X (R8)(R0), SWAP_
	LXVD2X (R17)(P2ptr), X0_
	LXVD2X (R18)(P2ptr), X1_
	VPERM  X0, X0, SWAP, X0
	VPERM  X1, X1, SWAP, X1
	VOR    RL, RL, Y0

	// VOR RH, RH, Y1   RH was saved above in D2X format
	LXVD2X (R1)(R17), Y1_
	CALL   p256MulInternal<>(SB)

	// SUB(R<T-S1)           // R = T-S1
	p256SubInternal(RH,RL,T1,T0,S1H,S1L)

	STXVD2X RH_, (R1)(R17) // Save RH

	// if R == 0 or R^P == 0 then ret=ret else ret=0
	// (R == 0 mod P means Y1 == Y2; ANDed with the H result above.)
	// clobbers T1H and T1L
	// Redo this using ISEL??
	MOVD       $1, TRUE
	VSPLTISB   $0, ZER
	VOR        RL, RH, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE (comment fixed: was "24", the ISEL below uses 26 like
	// the three sibling checks)
	ISEL       $26, R0, TRUE, RES1
	VXOR       RL, PL, T1L
	VXOR       RH, PH, T1H // SAVE: T1H
	VOR        T1L, T1H, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES2
	OR   RES2, RES1, RES1
	MOVD ret+24(FP), RES2
	AND  RES2, RES1, RES1
	MOVD RES1, ret+24(FP)

	// X=H ; Y=H ; MUL; T-   // T1 = H*H
	VOR  HL, HL, X0
	VOR  HH, HH, X1
	VOR  HL, HL, Y0
	VOR  HH, HH, Y1
	CALL p256MulInternal<>(SB)

	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
	VOR  T0, T0, Y0
	VOR  T1, T1, Y1
	CALL p256MulInternal<>(SB)
	VOR  T0, T0, T2L
	VOR  T1, T1, T2H

	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
	VOR  U1L, U1L, X0
	VOR  U1H, U1H, X1
	CALL p256MulInternal<>(SB)
	VOR  T0, T0, U1L
	VOR  T1, T1, U1H

	// X=R ; Y=R ; MUL; T-   // X3 = R*R
	VOR RL, RL, X0

	// VOR RH, RH, X1
	VOR RL, RL, Y0

	// RH was saved above using STXVD2X
	LXVD2X (R1)(R17), X1_
	VOR    X1, X1, Y1

	// VOR RH, RH, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<T-T2)           // X3 = X3-T2
	p256SubInternal(T1,T0,T1,T0,T2H,T2L)

	// ADD(X<U1+U1)          // T1 = 2*U1
	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)

	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
	p256SubInternal(T1,T0,T1,T0,X1,X0)
	MOVD    res+0(FP), P3ptr
	LXVD2X  (R8)(R0), SWAP_
	VPERM   T1, T1, SWAP, TT1
	VPERM   T0, T0, SWAP, TT0
	STXVD2X TT0_, (R0)(P3ptr)
	STXVD2X TT1_, (R16)(P3ptr)

	// SUB(Y<U1-T)           // Y3 = U1-X3
	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)

	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
	VOR RL, RL, X0

	// VOR RH, RH, X1
	LXVD2X (R1)(R17), X1_
	CALL   p256MulInternal<>(SB)
	VOR    T0, T0, U1L
	VOR    T1, T1, U1H

	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
	VOR  S1L, S1L, X0
	VOR  S1H, S1H, X1
	VOR  T2L, T2L, Y0
	VOR  T2H, T2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
	MOVD    res+0(FP), P3ptr
	LXVD2X  (R8)(R0), SWAP_
	VPERM   T1, T1, SWAP, TT1
	VPERM   T0, T0, SWAP, TT0
	STXVD2X TT0_, (R17)(P3ptr)
	STXVD2X TT1_, (R18)(P3ptr)

	RET